Mirror of https://github.com/meeb/tubesync.git
Merge pull request #612 from tcely/filter-metadata-response
Filter metadata to avoid storing excess text in the database table
Commit 51153f0053
@@ -19,7 +19,7 @@ from common.utils import clean_filename, clean_emoji
 from .youtube import (get_media_info as get_youtube_media_info,
                       download_media as download_youtube_media,
                       get_channel_image_info as get_youtube_channel_image_info)
-from .utils import seconds_to_timestr, parse_media_format
+from .utils import seconds_to_timestr, parse_media_format, filter_response
 from .matching import (get_best_combined_format, get_best_audio_format,
                        get_best_video_format)
 from .mediaservers import PlexMediaServer
@@ -1145,8 +1145,39 @@ class Media(models.Model):
     def has_metadata(self):
         return self.metadata is not None
 
+
+    @property
+    def reduce_data(self):
+        try:
+            from common.logger import log
+            from common.utils import json_serial
+
+            old_mdl = len(self.metadata or "")
+            data = json.loads(self.metadata or "")
+            compact_json = json.dumps(data, separators=(',', ':'), default=json_serial)
+
+            filtered_data = filter_response(data, True)
+            filtered_json = json.dumps(filtered_data, separators=(',', ':'), default=json_serial)
+        except Exception as e:
+            log.exception('reduce_data: %s', e)
+        else:
+            # log the results of filtering / compacting on metadata size
+            new_mdl = len(compact_json)
+            if old_mdl > new_mdl:
+                delta = old_mdl - new_mdl
+                log.info(f'{self.key}: metadata compacted by {delta:,} characters ({old_mdl:,} -> {new_mdl:,})')
+            new_mdl = len(filtered_json)
+            if old_mdl > new_mdl:
+                delta = old_mdl - new_mdl
+                log.info(f'{self.key}: metadata reduced by {delta:,} characters ({old_mdl:,} -> {new_mdl:,})')
+                if getattr(settings, 'SHRINK_OLD_MEDIA_METADATA', False):
+                    self.metadata = filtered_json
+
+
     @property
     def loaded_metadata(self):
+        if getattr(settings, 'SHRINK_OLD_MEDIA_METADATA', False):
+            self.reduce_data
         try:
             data = json.loads(self.metadata)
             if not isinstance(data, dict):
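Note (not part of the diff): `reduce_data` gains space in two independent ways, re-serialising the stored JSON with compact separators and dropping filtered keys, and it logs each saving separately. A minimal standalone sketch of just the compaction step, using only the standard `json` module (the sample blob is made up):

import json

# Hypothetical stored metadata, standing in for Media.metadata.
pretty = json.dumps({'id': 'abc123', 'formats': [{'format_id': '22'}]}, indent=4)

# Compact re-serialisation: no spaces after ',' or ':' and no newlines.
compact = json.dumps(json.loads(pretty), separators=(',', ':'))

delta = len(pretty) - len(compact)
print(f'compacted by {delta:,} characters ({len(pretty):,} -> {len(compact):,})')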
@@ -26,7 +26,7 @@ from common.errors import NoMediaException, DownloadFailedException
 from common.utils import json_serial
 from .models import Source, Media, MediaServer
 from .utils import (get_remote_image, resize_image_to_height, delete_file,
-                    write_text_file)
+                    write_text_file, filter_response)
 from .filtering import filter_media
 
 
@@ -304,7 +304,10 @@ def download_media_metadata(media_id):
         return
     source = media.source
     metadata = media.index_metadata()
-    media.metadata = json.dumps(metadata, default=json_serial)
+    response = metadata
+    if getattr(settings, 'SHRINK_NEW_MEDIA_METADATA', False):
+        response = filter_response(metadata, True)
+    media.metadata = json.dumps(response, separators=(',', ':'), default=json_serial)
    upload_date = media.upload_date
     # Media must have a valid upload date
     if upload_date:
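Both code paths are opt-in: each lookup uses `getattr(settings, ..., False)`, so filtering stays off unless the settings are defined. A sketch of enabling them in a Django settings module (the setting names are taken from this diff; the values shown are an assumption, not project defaults):

# settings.py (sketch)
SHRINK_NEW_MEDIA_METADATA = True  # filter metadata before it is first saved
SHRINK_OLD_MEDIA_METADATA = True  # shrink already-stored metadata via Media.reduce_data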
@@ -18,6 +18,7 @@ from background_task.models import Task
 from .models import Source, Media
 from .tasks import cleanup_old_media
 from .filtering import filter_media
+from .utils import filter_response
 
 
 class FrontEndTestCase(TestCase):
@@ -1709,6 +1710,84 @@ class FormatMatchingTestCase(TestCase):
                          f'expected {expected_match_result}')
 
 
+class ResponseFilteringTestCase(TestCase):
+
+    def setUp(self):
+        # Disable general logging for test case
+        logging.disable(logging.CRITICAL)
+        # Add a test source
+        self.source = Source.objects.create(
+            source_type=Source.SOURCE_TYPE_YOUTUBE_CHANNEL,
+            key='testkey',
+            name='testname',
+            directory='testdirectory',
+            index_schedule=3600,
+            delete_old_media=False,
+            days_to_keep=14,
+            source_resolution=Source.SOURCE_RESOLUTION_1080P,
+            source_vcodec=Source.SOURCE_VCODEC_VP9,
+            source_acodec=Source.SOURCE_ACODEC_OPUS,
+            prefer_60fps=False,
+            prefer_hdr=False,
+            fallback=Source.FALLBACK_FAIL
+        )
+        # Add some media
+        self.media = Media.objects.create(
+            key='mediakey',
+            source=self.source,
+            metadata='{}'
+        )
+
+    def test_metadata_20230629(self):
+        self.media.metadata = all_test_metadata['20230629']
+        self.media.save()
+
+        unfiltered = self.media.loaded_metadata
+        filtered = filter_response(self.media.loaded_metadata)
+        self.assertIn('formats', unfiltered.keys())
+        self.assertIn('formats', filtered.keys())
+        # filtered 'downloader_options'
+        self.assertIn('downloader_options', unfiltered['formats'][10].keys())
+        self.assertNotIn('downloader_options', filtered['formats'][10].keys())
+        # filtered 'http_headers'
+        self.assertIn('http_headers', unfiltered['formats'][0].keys())
+        self.assertNotIn('http_headers', filtered['formats'][0].keys())
+        # did not lose any formats
+        self.assertEqual(48, len(unfiltered['formats']))
+        self.assertEqual(48, len(filtered['formats']))
+        self.assertEqual(len(unfiltered['formats']), len(filtered['formats']))
+        # did not remove everything with url
+        self.assertIn('original_url', unfiltered.keys())
+        self.assertIn('original_url', filtered.keys())
+        self.assertEqual(unfiltered['original_url'], filtered['original_url'])
+        # did reduce the size of the metadata
+        self.assertTrue(len(str(filtered)) < len(str(unfiltered)))
+
+        url_keys = []
+        for format in unfiltered['formats']:
+            for key in format.keys():
+                if 'url' in key:
+                    url_keys.append((format['format_id'], key, format[key],))
+        unfiltered_url_keys = url_keys
+        self.assertEqual(63, len(unfiltered_url_keys), msg=str(unfiltered_url_keys))
+
+        url_keys = []
+        for format in filtered['formats']:
+            for key in format.keys():
+                if 'url' in key:
+                    url_keys.append((format['format_id'], key, format[key],))
+        filtered_url_keys = url_keys
+        self.assertEqual(3, len(filtered_url_keys), msg=str(filtered_url_keys))
+
+        url_keys = []
+        for lang_code, captions in filtered['automatic_captions'].items():
+            for caption in captions:
+                for key in caption.keys():
+                    if 'url' in key:
+                        url_keys.append((lang_code, caption['ext'], caption[key],))
+        self.assertEqual(0, len(url_keys), msg=str(url_keys))
+
+
 class TasksTestCase(TestCase):
 
     def setUp(self):
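`all_test_metadata` is a fixture mapping defined elsewhere in the test module and not shown in this diff; its '20230629' entry is a full captured yt-dlp response. A hypothetical, heavily trimmed stand-in with the same shape, for experimenting outside the test suite:

import json

# Hypothetical miniature of the fixture shape the test relies on.
all_test_metadata = {
    '20230629': json.dumps({
        'original_url': 'https://www.youtube.com/watch?v=mediakey',
        'formats': [
            {'format_id': '22',
             'url': 'https://example.invalid/video?expire=1700000000',
             'http_headers': {'User-Agent': 'yt-dlp'}},
        ],
        'automatic_captions': {},
    }),
}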
@@ -1,6 +1,7 @@
 import os
 import re
 import math
+from copy import deepcopy
 from operator import itemgetter
 from pathlib import Path
 from tempfile import NamedTemporaryFile
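The new `deepcopy` import is what makes `filter_response(..., copy_arg=True)` below safe: the response nests lists of format dicts, and a shallow copy would share those inner dicts with the caller, so deletions would leak back into the original. A quick standalone illustration (sample data made up):

from copy import deepcopy

response = {'formats': [{'format_id': '22', 'http_headers': {}}]}

shallow = dict(response)   # copies only the outer dict
deep = deepcopy(response)  # copies the nested structure too

del shallow['formats'][0]['http_headers']
print('http_headers' in response['formats'][0])  # False: shared inner dict mutated
print('http_headers' in deep['formats'][0])      # True: deep copy unaffected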
@@ -171,6 +172,95 @@ def normalize_codec(codec_str):
     return result
 
 
+def _url_keys(arg_dict, filter_func):
+    result = {}
+    for key in arg_dict.keys():
+        if 'url' in key:
+            result.update(
+                {key: filter_func(key=key, url=arg_dict[key])}
+            )
+    return result
+
+
+def _drop_url_keys(arg_dict, key, filter_func):
+    if key in arg_dict.keys():
+        for val_dict in arg_dict[key]:
+            for url_key, remove in _url_keys(val_dict, filter_func).items():
+                if remove is True:
+                    del val_dict[url_key]
+
+
+def filter_response(arg_dict, copy_arg=False):
+    '''
+        Clean up the response so as to not store useless metadata in the database.
+    '''
+    response_dict = arg_dict
+    # raise an exception for an unexpected argument type
+    if not isinstance(response_dict, dict):
+        raise TypeError(f'response_dict must be a dict, got "{type(response_dict)}"')
+
+    if copy_arg:
+        response_dict = deepcopy(arg_dict)
+
+    # optimize the empty case
+    if not response_dict:
+        return response_dict
+
+    # beginning of formats cleanup {{{
+    # drop urls that expire, or restrict IPs
+    def drop_format_url(**kwargs):
+        url = kwargs['url']
+        return (
+            url
+            and '://' in url
+            and (
+                '/ip/' in url
+                or 'ip=' in url
+                or '/expire/' in url
+                or 'expire=' in url
+            )
+        )
+
+    # these format keys are not useful to us
+    drop_keys = frozenset((
+        'downloader_options',
+        'fragments',
+        'http_headers',
+        '__needs_testing',
+        '__working',
+    ))
+    for key in frozenset(('formats', 'requested_formats',)):
+        _drop_url_keys(response_dict, key, drop_format_url)
+        if key in response_dict.keys():
+            for format in response_dict[key]:
+                for drop_key in drop_keys:
+                    if drop_key in format.keys():
+                        del format[drop_key]
+    # end of formats cleanup }}}
+
+    # beginning of subtitles cleanup {{{
+    # drop urls that expire
+    def drop_subtitles_url(**kwargs):
+        url = kwargs['url']
+        return (
+            url
+            and '://' in url
+            and (
+                '/expire/' in url
+                or '&expire=' in url
+            )
+        )
+
+    for key in frozenset(('subtitles', 'automatic_captions',)):
+        if key in response_dict.keys():
+            key_dict = response_dict[key]
+            for lang_code in key_dict:
+                _drop_url_keys(key_dict, lang_code, drop_subtitles_url)
+    # end of subtitles cleanup }}}
+
+    return response_dict
+
+
 def parse_media_format(format_dict):
     '''
         This parser primarily adapts the format dict returned by youtube-dl into a
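A usage sketch of the new helper (the call signature comes from the diff above; the input dict is made up). With `copy_arg=True` the caller's dict is left untouched; expiring format URLs and the blanket `drop_keys` entries are removed, while top-level keys such as `original_url` survive:

metadata = {
    'id': 'abc123',
    'original_url': 'https://www.youtube.com/watch?v=abc123',
    'formats': [{
        'format_id': '22',
        'url': 'https://cdn.example.invalid/video?expire=1700000000',
        'http_headers': {'User-Agent': 'yt-dlp'},
    }],
}

filtered = filter_response(metadata, copy_arg=True)
assert 'url' not in filtered['formats'][0]           # expiring URL dropped
assert 'http_headers' not in filtered['formats'][0]  # blanket drop_keys entry
assert filtered['original_url'] == metadata['original_url']
assert 'http_headers' in metadata['formats'][0]      # original left untouched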