Merge pull request #612 from tcely/filter-metadata-response

Filter metadata to avoid storing excess text in the database table
This commit is contained in:
meeb 2025-01-15 01:29:08 +11:00 committed by GitHub
commit 51153f0053
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 206 additions and 3 deletions

View File

@ -19,7 +19,7 @@ from common.utils import clean_filename, clean_emoji
from .youtube import (get_media_info as get_youtube_media_info,
download_media as download_youtube_media,
get_channel_image_info as get_youtube_channel_image_info)
from .utils import seconds_to_timestr, parse_media_format
from .utils import seconds_to_timestr, parse_media_format, filter_response
from .matching import (get_best_combined_format, get_best_audio_format,
get_best_video_format)
from .mediaservers import PlexMediaServer
@ -1145,8 +1145,39 @@ class Media(models.Model):
def has_metadata(self):
return self.metadata is not None
@property
def reduce_data(self):
try:
from common.logger import log
from common.utils import json_serial
old_mdl = len(self.metadata or "")
data = json.loads(self.metadata or "")
compact_json = json.dumps(data, separators=(',', ':'), default=json_serial)
filtered_data = filter_response(data, True)
filtered_json = json.dumps(filtered_data, separators=(',', ':'), default=json_serial)
except Exception as e:
log.exception('reduce_data: %s', e)
else:
# log the results of filtering / compacting on metadata size
new_mdl = len(compact_json)
if old_mdl > new_mdl:
delta = old_mdl - new_mdl
log.info(f'{self.key}: metadata compacted by {delta:,} characters ({old_mdl:,} -> {new_mdl:,})')
new_mdl = len(filtered_json)
if old_mdl > new_mdl:
delta = old_mdl - new_mdl
log.info(f'{self.key}: metadata reduced by {delta:,} characters ({old_mdl:,} -> {new_mdl:,})')
if getattr(settings, 'SHRINK_OLD_MEDIA_METADATA', False):
self.metadata = filtered_json
@property
def loaded_metadata(self):
if getattr(settings, 'SHRINK_OLD_MEDIA_METADATA', False):
self.reduce_data
try:
data = json.loads(self.metadata)
if not isinstance(data, dict):

View File

@ -26,7 +26,7 @@ from common.errors import NoMediaException, DownloadFailedException
from common.utils import json_serial
from .models import Source, Media, MediaServer
from .utils import (get_remote_image, resize_image_to_height, delete_file,
write_text_file)
write_text_file, filter_response)
from .filtering import filter_media
@ -304,7 +304,10 @@ def download_media_metadata(media_id):
return
source = media.source
metadata = media.index_metadata()
media.metadata = json.dumps(metadata, default=json_serial)
response = metadata
if getattr(settings, 'SHRINK_NEW_MEDIA_METADATA', False):
response = filter_response(metadata, True)
media.metadata = json.dumps(response, separators=(',', ':'), default=json_serial)
upload_date = media.upload_date
# Media must have a valid upload date
if upload_date:

View File

@ -18,6 +18,7 @@ from background_task.models import Task
from .models import Source, Media
from .tasks import cleanup_old_media
from .filtering import filter_media
from .utils import filter_response
class FrontEndTestCase(TestCase):
@ -1709,6 +1710,84 @@ class FormatMatchingTestCase(TestCase):
f'expected {expected_match_result}')
class ResponseFilteringTestCase(TestCase):
def setUp(self):
# Disable general logging for test case
logging.disable(logging.CRITICAL)
# Add a test source
self.source = Source.objects.create(
source_type=Source.SOURCE_TYPE_YOUTUBE_CHANNEL,
key='testkey',
name='testname',
directory='testdirectory',
index_schedule=3600,
delete_old_media=False,
days_to_keep=14,
source_resolution=Source.SOURCE_RESOLUTION_1080P,
source_vcodec=Source.SOURCE_VCODEC_VP9,
source_acodec=Source.SOURCE_ACODEC_OPUS,
prefer_60fps=False,
prefer_hdr=False,
fallback=Source.FALLBACK_FAIL
)
# Add some media
self.media = Media.objects.create(
key='mediakey',
source=self.source,
metadata='{}'
)
def test_metadata_20230629(self):
self.media.metadata = all_test_metadata['20230629']
self.media.save()
unfiltered = self.media.loaded_metadata
filtered = filter_response(self.media.loaded_metadata)
self.assertIn('formats', unfiltered.keys())
self.assertIn('formats', filtered.keys())
# filtered 'downloader_options'
self.assertIn('downloader_options', unfiltered['formats'][10].keys())
self.assertNotIn('downloader_options', filtered['formats'][10].keys())
# filtered 'http_headers'
self.assertIn('http_headers', unfiltered['formats'][0].keys())
self.assertNotIn('http_headers', filtered['formats'][0].keys())
# did not lose any formats
self.assertEqual(48, len(unfiltered['formats']))
self.assertEqual(48, len(filtered['formats']))
self.assertEqual(len(unfiltered['formats']), len(filtered['formats']))
# did not remove everything with url
self.assertIn('original_url', unfiltered.keys())
self.assertIn('original_url', filtered.keys())
self.assertEqual(unfiltered['original_url'], filtered['original_url'])
# did reduce the size of the metadata
self.assertTrue(len(str(filtered)) < len(str(unfiltered)))
url_keys = []
for format in unfiltered['formats']:
for key in format.keys():
if 'url' in key:
url_keys.append((format['format_id'], key, format[key],))
unfiltered_url_keys = url_keys
self.assertEqual(63, len(unfiltered_url_keys), msg=str(unfiltered_url_keys))
url_keys = []
for format in filtered['formats']:
for key in format.keys():
if 'url' in key:
url_keys.append((format['format_id'], key, format[key],))
filtered_url_keys = url_keys
self.assertEqual(3, len(filtered_url_keys), msg=str(filtered_url_keys))
url_keys = []
for lang_code, captions in filtered['automatic_captions'].items():
for caption in captions:
for key in caption.keys():
if 'url' in key:
url_keys.append((lang_code, caption['ext'], caption[key],))
self.assertEqual(0, len(url_keys), msg=str(url_keys))
class TasksTestCase(TestCase):
def setUp(self):

View File

@ -1,6 +1,7 @@
import os
import re
import math
from copy import deepcopy
from operator import itemgetter
from pathlib import Path
from tempfile import NamedTemporaryFile
@ -171,6 +172,95 @@ def normalize_codec(codec_str):
return result
def _url_keys(arg_dict, filter_func):
result = {}
for key in arg_dict.keys():
if 'url' in key:
result.update(
{key: filter_func(key=key, url=arg_dict[key])}
)
return result
def _drop_url_keys(arg_dict, key, filter_func):
if key in arg_dict.keys():
for val_dict in arg_dict[key]:
for url_key, remove in _url_keys(val_dict, filter_func).items():
if remove is True:
del val_dict[url_key]
def filter_response(arg_dict, copy_arg=False):
'''
Clean up the response so as to not store useless metadata in the database.
'''
response_dict = arg_dict
# raise an exception for an unexpected argument type
if not isinstance(response_dict, dict):
raise TypeError(f'response_dict must be a dict, got "{type(response_dict)}"')
if copy_arg:
response_dict = deepcopy(arg_dict)
# optimize the empty case
if not response_dict:
return response_dict
# beginning of formats cleanup {{{
# drop urls that expire, or restrict IPs
def drop_format_url(**kwargs):
url = kwargs['url']
return (
url
and '://' in url
and (
'/ip/' in url
or 'ip=' in url
or '/expire/' in url
or 'expire=' in url
)
)
# these format keys are not useful to us
drop_keys = frozenset((
'downloader_options',
'fragments',
'http_headers',
'__needs_testing',
'__working',
))
for key in frozenset(('formats', 'requested_formats',)):
_drop_url_keys(response_dict, key, drop_format_url)
if key in response_dict.keys():
for format in response_dict[key]:
for drop_key in drop_keys:
if drop_key in format.keys():
del format[drop_key]
# end of formats cleanup }}}
# beginning of subtitles cleanup {{{
# drop urls that expire
def drop_subtitles_url(**kwargs):
url = kwargs['url']
return (
url
and '://' in url
and (
'/expire/' in url
or '&expire=' in url
)
)
for key in frozenset(('subtitles', 'automatic_captions',)):
if key in response_dict.keys():
key_dict = response_dict[key]
for lang_code in key_dict:
_drop_url_keys(key_dict, lang_code, drop_subtitles_url)
# end of subtitles cleanup }}}
return response_dict
def parse_media_format(format_dict):
'''
This parser primarily adapts the format dict returned by youtube-dl into a