Mirror of https://github.com/meeb/tubesync.git (synced 2025-06-26 06:56:36 +00:00)
Merge pull request #612 from tcely/filter-metadata-response

Filter metadata to avoid storing excess text in the database table

Commit 51153f0053
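
The filtering is gated by two settings that the changes below read with getattr() and treat as off by default. A minimal sketch of opting in from the project settings module (the setting names come from the diff; how a deployment actually exposes them, for example via environment variables, is not shown here):

    # settings sketch -- not part of this commit
    # Filter metadata for newly indexed media before it is written to the database.
    SHRINK_NEW_MEDIA_METADATA = True
    # Filter and compact metadata already stored for existing media when it is next loaded.
    SHRINK_OLD_MEDIA_METADATA = True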
models module:

@@ -19,7 +19,7 @@ from common.utils import clean_filename, clean_emoji
 from .youtube import (get_media_info as get_youtube_media_info,
                       download_media as download_youtube_media,
                       get_channel_image_info as get_youtube_channel_image_info)
-from .utils import seconds_to_timestr, parse_media_format
+from .utils import seconds_to_timestr, parse_media_format, filter_response
 from .matching import (get_best_combined_format, get_best_audio_format,
                        get_best_video_format)
 from .mediaservers import PlexMediaServer
@@ -1145,8 +1145,39 @@ class Media(models.Model):
     def has_metadata(self):
         return self.metadata is not None
 
+
+    @property
+    def reduce_data(self):
+        try:
+            from common.logger import log
+            from common.utils import json_serial
+
+            old_mdl = len(self.metadata or "")
+            data = json.loads(self.metadata or "")
+            compact_json = json.dumps(data, separators=(',', ':'), default=json_serial)
+
+            filtered_data = filter_response(data, True)
+            filtered_json = json.dumps(filtered_data, separators=(',', ':'), default=json_serial)
+        except Exception as e:
+            log.exception('reduce_data: %s', e)
+        else:
+            # log the results of filtering / compacting on metadata size
+            new_mdl = len(compact_json)
+            if old_mdl > new_mdl:
+                delta = old_mdl - new_mdl
+                log.info(f'{self.key}: metadata compacted by {delta:,} characters ({old_mdl:,} -> {new_mdl:,})')
+            new_mdl = len(filtered_json)
+            if old_mdl > new_mdl:
+                delta = old_mdl - new_mdl
+                log.info(f'{self.key}: metadata reduced by {delta:,} characters ({old_mdl:,} -> {new_mdl:,})')
+                if getattr(settings, 'SHRINK_OLD_MEDIA_METADATA', False):
+                    self.metadata = filtered_json
+
+
     @property
     def loaded_metadata(self):
+        if getattr(settings, 'SHRINK_OLD_MEDIA_METADATA', False):
+            self.reduce_data
         try:
             data = json.loads(self.metadata)
             if not isinstance(data, dict):
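
Note that reduce_data only rewrites self.metadata in memory, and only assigns the filtered JSON when SHRINK_OLD_MEDIA_METADATA is enabled; nothing in this hunk saves the model. A hypothetical Django shell session using only names that appear in the diff (the lookup key is a placeholder and the import path assumes the app is importable as sync):

    from sync.models import Media

    media = Media.objects.get(key='mediakey')      # placeholder key, not a real row
    before = len(media.metadata or '')
    media.reduce_data                              # property: filters/compacts the JSON in memory, logs the delta
    media.save()                                   # persist the smaller metadata if it changed
    print(before, '->', len(media.metadata or ''))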
tasks module:

@@ -26,7 +26,7 @@ from common.errors import NoMediaException, DownloadFailedException
 from common.utils import json_serial
 from .models import Source, Media, MediaServer
 from .utils import (get_remote_image, resize_image_to_height, delete_file,
-                    write_text_file)
+                    write_text_file, filter_response)
 from .filtering import filter_media
 
 
@@ -304,7 +304,10 @@ def download_media_metadata(media_id):
         return
     source = media.source
     metadata = media.index_metadata()
-    media.metadata = json.dumps(metadata, default=json_serial)
+    response = metadata
+    if getattr(settings, 'SHRINK_NEW_MEDIA_METADATA', False):
+        response = filter_response(metadata, True)
+    media.metadata = json.dumps(response, separators=(',', ':'), default=json_serial)
     upload_date = media.upload_date
     # Media must have a valid upload date
     if upload_date:
tests module:

@@ -18,6 +18,7 @@ from background_task.models import Task
 from .models import Source, Media
 from .tasks import cleanup_old_media
 from .filtering import filter_media
+from .utils import filter_response
 
 
 class FrontEndTestCase(TestCase):
@@ -1709,6 +1710,84 @@ class FormatMatchingTestCase(TestCase):
                          f'expected {expected_match_result}')
 
 
+class ResponseFilteringTestCase(TestCase):
+
+    def setUp(self):
+        # Disable general logging for test case
+        logging.disable(logging.CRITICAL)
+        # Add a test source
+        self.source = Source.objects.create(
+            source_type=Source.SOURCE_TYPE_YOUTUBE_CHANNEL,
+            key='testkey',
+            name='testname',
+            directory='testdirectory',
+            index_schedule=3600,
+            delete_old_media=False,
+            days_to_keep=14,
+            source_resolution=Source.SOURCE_RESOLUTION_1080P,
+            source_vcodec=Source.SOURCE_VCODEC_VP9,
+            source_acodec=Source.SOURCE_ACODEC_OPUS,
+            prefer_60fps=False,
+            prefer_hdr=False,
+            fallback=Source.FALLBACK_FAIL
+        )
+        # Add some media
+        self.media = Media.objects.create(
+            key='mediakey',
+            source=self.source,
+            metadata='{}'
+        )
+
+    def test_metadata_20230629(self):
+        self.media.metadata = all_test_metadata['20230629']
+        self.media.save()
+
+        unfiltered = self.media.loaded_metadata
+        filtered = filter_response(self.media.loaded_metadata)
+        self.assertIn('formats', unfiltered.keys())
+        self.assertIn('formats', filtered.keys())
+        # filtered 'downloader_options'
+        self.assertIn('downloader_options', unfiltered['formats'][10].keys())
+        self.assertNotIn('downloader_options', filtered['formats'][10].keys())
+        # filtered 'http_headers'
+        self.assertIn('http_headers', unfiltered['formats'][0].keys())
+        self.assertNotIn('http_headers', filtered['formats'][0].keys())
+        # did not lose any formats
+        self.assertEqual(48, len(unfiltered['formats']))
+        self.assertEqual(48, len(filtered['formats']))
+        self.assertEqual(len(unfiltered['formats']), len(filtered['formats']))
+        # did not remove everything with url
+        self.assertIn('original_url', unfiltered.keys())
+        self.assertIn('original_url', filtered.keys())
+        self.assertEqual(unfiltered['original_url'], filtered['original_url'])
+        # did reduce the size of the metadata
+        self.assertTrue(len(str(filtered)) < len(str(unfiltered)))
+
+        url_keys = []
+        for format in unfiltered['formats']:
+            for key in format.keys():
+                if 'url' in key:
+                    url_keys.append((format['format_id'], key, format[key],))
+        unfiltered_url_keys = url_keys
+        self.assertEqual(63, len(unfiltered_url_keys), msg=str(unfiltered_url_keys))
+
+        url_keys = []
+        for format in filtered['formats']:
+            for key in format.keys():
+                if 'url' in key:
+                    url_keys.append((format['format_id'], key, format[key],))
+        filtered_url_keys = url_keys
+        self.assertEqual(3, len(filtered_url_keys), msg=str(filtered_url_keys))
+
+        url_keys = []
+        for lang_code, captions in filtered['automatic_captions'].items():
+            for caption in captions:
+                for key in caption.keys():
+                    if 'url' in key:
+                        url_keys.append((lang_code, caption['ext'], caption[key],))
+        self.assertEqual(0, len(url_keys), msg=str(url_keys))
+
+
 class TasksTestCase(TestCase):
 
     def setUp(self):
utils module:

@@ -1,6 +1,7 @@
 import os
 import re
 import math
+from copy import deepcopy
 from operator import itemgetter
 from pathlib import Path
 from tempfile import NamedTemporaryFile
@@ -171,6 +172,95 @@ def normalize_codec(codec_str):
     return result
 
 
+def _url_keys(arg_dict, filter_func):
+    result = {}
+    for key in arg_dict.keys():
+        if 'url' in key:
+            result.update(
+                {key: filter_func(key=key, url=arg_dict[key])}
+            )
+    return result
+
+
+def _drop_url_keys(arg_dict, key, filter_func):
+    if key in arg_dict.keys():
+        for val_dict in arg_dict[key]:
+            for url_key, remove in _url_keys(val_dict, filter_func).items():
+                if remove is True:
+                    del val_dict[url_key]
+
+
+def filter_response(arg_dict, copy_arg=False):
+    '''
+        Clean up the response so as to not store useless metadata in the database.
+    '''
+    response_dict = arg_dict
+    # raise an exception for an unexpected argument type
+    if not isinstance(response_dict, dict):
+        raise TypeError(f'response_dict must be a dict, got "{type(response_dict)}"')
+
+    if copy_arg:
+        response_dict = deepcopy(arg_dict)
+
+    # optimize the empty case
+    if not response_dict:
+        return response_dict
+
+    # beginning of formats cleanup {{{
+    # drop urls that expire, or restrict IPs
+    def drop_format_url(**kwargs):
+        url = kwargs['url']
+        return (
+            url
+            and '://' in url
+            and (
+                '/ip/' in url
+                or 'ip=' in url
+                or '/expire/' in url
+                or 'expire=' in url
+            )
+        )
+
+    # these format keys are not useful to us
+    drop_keys = frozenset((
+        'downloader_options',
+        'fragments',
+        'http_headers',
+        '__needs_testing',
+        '__working',
+    ))
+    for key in frozenset(('formats', 'requested_formats',)):
+        _drop_url_keys(response_dict, key, drop_format_url)
+        if key in response_dict.keys():
+            for format in response_dict[key]:
+                for drop_key in drop_keys:
+                    if drop_key in format.keys():
+                        del format[drop_key]
+    # end of formats cleanup }}}
+
+    # beginning of subtitles cleanup {{{
+    # drop urls that expire
+    def drop_subtitles_url(**kwargs):
+        url = kwargs['url']
+        return (
+            url
+            and '://' in url
+            and (
+                '/expire/' in url
+                or '&expire=' in url
+            )
+        )
+
+    for key in frozenset(('subtitles', 'automatic_captions',)):
+        if key in response_dict.keys():
+            key_dict = response_dict[key]
+            for lang_code in key_dict:
+                _drop_url_keys(key_dict, lang_code, drop_subtitles_url)
+    # end of subtitles cleanup }}}
+
+    return response_dict
+
+
 def parse_media_format(format_dict):
     '''
     This parser primarily adapts the format dict returned by youtube-dl into a
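
To illustrate what the new helper strips, here is a small sketch that runs a fabricated yt-dlp-style response through filter_response (the sample dict and its URLs are invented; the function, its copy_arg flag, and the dropped key names come from the diff above, and the import path assumes the app is importable as sync):

    from sync.utils import filter_response

    sample = {
        'id': 'abc123',                        # fabricated example data
        'original_url': 'https://www.youtube.com/watch?v=abc123',
        'formats': [{
            'format_id': '251',
            'url': 'https://example.invalid/videoplayback?expire=1700000000&ip=203.0.113.7',
            'http_headers': {'User-Agent': 'Mozilla/5.0'},
            'downloader_options': {'http_chunk_size': 10485760},
        }],
    }

    filtered = filter_response(sample, copy_arg=True)           # copy_arg=True deep-copies, leaving sample intact
    assert 'url' not in filtered['formats'][0]                  # expiring / IP-locked format URL dropped
    assert 'http_headers' not in filtered['formats'][0]         # bulky per-format keys dropped
    assert 'downloader_options' not in filtered['formats'][0]
    assert filtered['original_url'] == sample['original_url']   # top-level URLs are kept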