Merge pull request #612 from tcely/filter-metadata-response

Filter metadata to avoid storing excess text in the database table
meeb authored on 2025-01-15 01:29:08 +11:00, committed by GitHub
commit 51153f0053
4 changed files with 206 additions and 3 deletions
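Both new code paths are gated behind settings flags read with `getattr(settings, ..., False)`, so filtering stays off unless it is explicitly enabled. A minimal sketch of turning the flags on in a local Django settings override (only the two flag names come from this commit; where the override lives is an assumption about your deployment):

    # Hypothetical settings override -- the flag names are from this commit,
    # their placement is an assumption about your deployment.
    SHRINK_NEW_MEDIA_METADATA = True   # filter metadata as it is first downloaded (tasks.py)
    SHRINK_OLD_MEDIA_METADATA = True   # re-filter stored metadata the next time it is loaded (models.py)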

tubesync/sync/models.py

@@ -19,7 +19,7 @@ from common.utils import clean_filename, clean_emoji
 from .youtube import (get_media_info as get_youtube_media_info,
                       download_media as download_youtube_media,
                       get_channel_image_info as get_youtube_channel_image_info)
-from .utils import seconds_to_timestr, parse_media_format
+from .utils import seconds_to_timestr, parse_media_format, filter_response
 from .matching import (get_best_combined_format, get_best_audio_format,
                        get_best_video_format)
 from .mediaservers import PlexMediaServer
@@ -1145,8 +1145,39 @@
     def has_metadata(self):
         return self.metadata is not None
 
+    @property
+    def reduce_data(self):
+        try:
+            from common.logger import log
+            from common.utils import json_serial
+
+            old_mdl = len(self.metadata or "")
+            data = json.loads(self.metadata or "")
+            compact_json = json.dumps(data, separators=(',', ':'), default=json_serial)
+
+            filtered_data = filter_response(data, True)
+            filtered_json = json.dumps(filtered_data, separators=(',', ':'), default=json_serial)
+        except Exception as e:
+            log.exception('reduce_data: %s', e)
+        else:
+            # log the results of filtering / compacting on metadata size
+            new_mdl = len(compact_json)
+            if old_mdl > new_mdl:
+                delta = old_mdl - new_mdl
+                log.info(f'{self.key}: metadata compacted by {delta:,} characters ({old_mdl:,} -> {new_mdl:,})')
+
+            new_mdl = len(filtered_json)
+            if old_mdl > new_mdl:
+                delta = old_mdl - new_mdl
+                log.info(f'{self.key}: metadata reduced by {delta:,} characters ({old_mdl:,} -> {new_mdl:,})')
+
+            if getattr(settings, 'SHRINK_OLD_MEDIA_METADATA', False):
+                self.metadata = filtered_json
+
     @property
     def loaded_metadata(self):
+        if getattr(settings, 'SHRINK_OLD_MEDIA_METADATA', False):
+            self.reduce_data
         try:
             data = json.loads(self.metadata)
             if not isinstance(data, dict):
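With `SHRINK_OLD_MEDIA_METADATA` enabled, existing rows are shrunk lazily: the first access to `loaded_metadata` runs `reduce_data`, which replaces `self.metadata` in memory but does not call `save()`. A rough sketch of that flow (the lookup key and the explicit save are illustrative, not part of this diff):

    # Illustrative only: shrink one stored metadata blob via the new lazy path.
    # Assumes Django is configured and: from sync.models import Media
    media = Media.objects.get(key='mediakey')    # hypothetical key
    data = media.loaded_metadata                 # triggers reduce_data when the flag is set
    media.save(update_fields=['metadata'])       # persisting the smaller JSON is left to the caller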

tubesync/sync/tasks.py

@@ -26,7 +26,7 @@ from common.errors import NoMediaException, DownloadFailedException
 from common.utils import json_serial
 from .models import Source, Media, MediaServer
 from .utils import (get_remote_image, resize_image_to_height, delete_file,
-                    write_text_file)
+                    write_text_file, filter_response)
 from .filtering import filter_media
@@ -304,7 +304,10 @@ def download_media_metadata(media_id):
         return
     source = media.source
     metadata = media.index_metadata()
-    media.metadata = json.dumps(metadata, default=json_serial)
+    response = metadata
+    if getattr(settings, 'SHRINK_NEW_MEDIA_METADATA', False):
+        response = filter_response(metadata, True)
+    media.metadata = json.dumps(response, separators=(',', ':'), default=json_serial)
     upload_date = media.upload_date
     # Media must have a valid upload date
     if upload_date:
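Independent of the filtering itself, switching `json.dumps` to `separators=(',', ':')` already saves space, because the default separators insert a space after every comma and colon. A standalone check of that effect:

    import json

    payload = {'id': 'abc123', 'formats': [{'format_id': '251', 'acodec': 'opus'}]}
    print(len(json.dumps(payload)))                         # default separators: ', ' and ': '
    print(len(json.dumps(payload, separators=(',', ':'))))  # compact: no padding spaces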

tubesync/sync/tests.py

@@ -18,6 +18,7 @@ from background_task.models import Task
 from .models import Source, Media
 from .tasks import cleanup_old_media
 from .filtering import filter_media
+from .utils import filter_response
 
 
 class FrontEndTestCase(TestCase):
@@ -1709,6 +1710,84 @@
                          f'expected {expected_match_result}')
 
 
+class ResponseFilteringTestCase(TestCase):
+
+    def setUp(self):
+        # Disable general logging for test case
+        logging.disable(logging.CRITICAL)
+        # Add a test source
+        self.source = Source.objects.create(
+            source_type=Source.SOURCE_TYPE_YOUTUBE_CHANNEL,
+            key='testkey',
+            name='testname',
+            directory='testdirectory',
+            index_schedule=3600,
+            delete_old_media=False,
+            days_to_keep=14,
+            source_resolution=Source.SOURCE_RESOLUTION_1080P,
+            source_vcodec=Source.SOURCE_VCODEC_VP9,
+            source_acodec=Source.SOURCE_ACODEC_OPUS,
+            prefer_60fps=False,
+            prefer_hdr=False,
+            fallback=Source.FALLBACK_FAIL
+        )
+        # Add some media
+        self.media = Media.objects.create(
+            key='mediakey',
+            source=self.source,
+            metadata='{}'
+        )
+
+    def test_metadata_20230629(self):
+        self.media.metadata = all_test_metadata['20230629']
+        self.media.save()
+
+        unfiltered = self.media.loaded_metadata
+        filtered = filter_response(self.media.loaded_metadata)
+        self.assertIn('formats', unfiltered.keys())
+        self.assertIn('formats', filtered.keys())
+        # filtered 'downloader_options'
+        self.assertIn('downloader_options', unfiltered['formats'][10].keys())
+        self.assertNotIn('downloader_options', filtered['formats'][10].keys())
+        # filtered 'http_headers'
+        self.assertIn('http_headers', unfiltered['formats'][0].keys())
+        self.assertNotIn('http_headers', filtered['formats'][0].keys())
+        # did not lose any formats
+        self.assertEqual(48, len(unfiltered['formats']))
+        self.assertEqual(48, len(filtered['formats']))
+        self.assertEqual(len(unfiltered['formats']), len(filtered['formats']))
+        # did not remove everything with 'url'
+        self.assertIn('original_url', unfiltered.keys())
+        self.assertIn('original_url', filtered.keys())
+        self.assertEqual(unfiltered['original_url'], filtered['original_url'])
+        # did reduce the size of the metadata
+        self.assertTrue(len(str(filtered)) < len(str(unfiltered)))
+
+        url_keys = []
+        for format in unfiltered['formats']:
+            for key in format.keys():
+                if 'url' in key:
+                    url_keys.append((format['format_id'], key, format[key],))
+        unfiltered_url_keys = url_keys
+        self.assertEqual(63, len(unfiltered_url_keys), msg=str(unfiltered_url_keys))
+
+        url_keys = []
+        for format in filtered['formats']:
+            for key in format.keys():
+                if 'url' in key:
+                    url_keys.append((format['format_id'], key, format[key],))
+        filtered_url_keys = url_keys
+        self.assertEqual(3, len(filtered_url_keys), msg=str(filtered_url_keys))
+
+        url_keys = []
+        for lang_code, captions in filtered['automatic_captions'].items():
+            for caption in captions:
+                for key in caption.keys():
+                    if 'url' in key:
+                        url_keys.append((lang_code, caption['ext'], caption[key],))
+        self.assertEqual(0, len(url_keys), msg=str(url_keys))
+
+
 class TasksTestCase(TestCase):
 
     def setUp(self):
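One subtlety in the test above: `filter_response` mutates its argument in place unless `copy_arg=True`, so comparing `unfiltered` and `filtered` only works because each access to `loaded_metadata` re-parses the stored JSON into a fresh dict. A standalone illustration of that in-place behaviour (the sample dict is invented):

    # filter_response as introduced in utils.py below; import path assumed to be sync.utils.
    sample = {'formats': [{'format_id': '18',
                           'url': 'https://example.invalid/videoplayback?expire=123',
                           'http_headers': {'User-Agent': 'test'}}]}
    same = filter_response(sample)           # filters and returns the very same object
    assert same is sample and 'url' not in sample['formats'][0]
    copied = filter_response(sample, True)   # deepcopy first, the argument is left as-is
    assert copied is not sample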

tubesync/sync/utils.py

@@ -1,6 +1,7 @@
 import os
 import re
 import math
+from copy import deepcopy
 from operator import itemgetter
 from pathlib import Path
 from tempfile import NamedTemporaryFile
@@ -171,6 +172,95 @@ def normalize_codec(codec_str):
     return result
 
 
+def _url_keys(arg_dict, filter_func):
+    result = {}
+    for key in arg_dict.keys():
+        if 'url' in key:
+            result.update(
+                {key: filter_func(key=key, url=arg_dict[key])}
+            )
+    return result
+
+
+def _drop_url_keys(arg_dict, key, filter_func):
+    if key in arg_dict.keys():
+        for val_dict in arg_dict[key]:
+            for url_key, remove in _url_keys(val_dict, filter_func).items():
+                if remove is True:
+                    del val_dict[url_key]
+
+
+def filter_response(arg_dict, copy_arg=False):
+    '''
+        Clean up the response so as to not store useless metadata in the database.
+    '''
+    response_dict = arg_dict
+    # raise an exception for an unexpected argument type
+    if not isinstance(response_dict, dict):
+        raise TypeError(f'response_dict must be a dict, got "{type(response_dict)}"')
+
+    if copy_arg:
+        response_dict = deepcopy(arg_dict)
+
+    # optimize the empty case
+    if not response_dict:
+        return response_dict
+
+    # beginning of formats cleanup {{{
+    # drop urls that expire, or restrict IPs
+    def drop_format_url(**kwargs):
+        url = kwargs['url']
+        return (
+            url
+            and '://' in url
+            and (
+                '/ip/' in url
+                or 'ip=' in url
+                or '/expire/' in url
+                or 'expire=' in url
+            )
+        )
+    # these format keys are not useful to us
+    drop_keys = frozenset((
+        'downloader_options',
+        'fragments',
+        'http_headers',
+        '__needs_testing',
+        '__working',
+    ))
+    for key in frozenset(('formats', 'requested_formats',)):
+        _drop_url_keys(response_dict, key, drop_format_url)
+        if key in response_dict.keys():
+            for format in response_dict[key]:
+                for drop_key in drop_keys:
+                    if drop_key in format.keys():
+                        del format[drop_key]
+    # end of formats cleanup }}}
+
+    # beginning of subtitles cleanup {{{
+    # drop urls that expire
+    def drop_subtitles_url(**kwargs):
+        url = kwargs['url']
+        return (
+            url
+            and '://' in url
+            and (
+                '/expire/' in url
+                or '&expire=' in url
+            )
+        )
+    for key in frozenset(('subtitles', 'automatic_captions',)):
+        if key in response_dict.keys():
+            key_dict = response_dict[key]
+            for lang_code in key_dict:
+                _drop_url_keys(key_dict, lang_code, drop_subtitles_url)
+    # end of subtitles cleanup }}}
+
+    return response_dict
+
+
 def parse_media_format(format_dict):
     '''
         This parser primarily adapts the format dict returned by youtube-dl into a
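Putting the pieces together, a small end-to-end sketch of the `SHRINK_NEW_MEDIA_METADATA` path from tasks.py, with a made-up miniature dict standing in for a real yt-dlp response (which is vastly larger):

    # Sketch only: the metadata dict is invented; filter_response is the function above,
    # assumed importable as: from sync.utils import filter_response
    import json

    metadata = {
        'id': 'dQw4w9WgXcQ',
        'original_url': 'https://www.youtube.com/watch?v=dQw4w9WgXcQ',
        'formats': [
            {'format_id': '251', 'acodec': 'opus',
             'url': 'https://r4.example.invalid/videoplayback?expire=1736900000&ip=198.51.100.7',
             'http_headers': {'User-Agent': 'Mozilla/5.0'},
             'downloader_options': {'http_chunk_size': 10485760}},
        ],
    }
    before = json.dumps(metadata)
    after = json.dumps(filter_response(metadata, True), separators=(',', ':'))
    print(f'{len(before):,} -> {len(after):,} characters')  # expiring url, headers and options are gone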