From 8c22b6c99efb464dfb450707440c6162f04b7b46 Mon Sep 17 00:00:00 2001
From: tcely
Date: Tue, 7 Jan 2025 00:05:57 -0500
Subject: [PATCH 01/26] Add response filtering

These functions aren't being used yet; they will be tested against my
database before that happens.
---
 tubesync/sync/utils.py | 62 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 62 insertions(+)

diff --git a/tubesync/sync/utils.py b/tubesync/sync/utils.py
index 3e29fe3f..e44cef1f 100644
--- a/tubesync/sync/utils.py
+++ b/tubesync/sync/utils.py
@@ -170,6 +170,68 @@ def normalize_codec(codec_str):
     return result
 
 
+def _url_keys(arg_dict, filter_func):
+    result = {}
+    for key in arg_dict.keys():
+        if 'url' in key:
+            result.update(
+                {key: (key, filter_func(key=key, url=arg_dict[key]),)}
+            )
+    return result
+
+
+def _drop_url_keys(arg_dict, key, filter_func):
+    if key in arg_dict.keys():
+        for val_dict in arg_dict[key]:
+            for url_key in _url_keys(val_dict, filter_func):
+                if url_key[1] is True:
+                    del val_dict[url_key[0]]
+
+
+def filter_response(response_dict):
+    '''
+    Clean up the response so as to not store useless metadata in the database.
+    '''
+    # raise an exception for an unexpected argument type
+    if not isinstance(filedata, dict):
+        raise TypeError(f'filedata must be a dict, got "{type(filedata)}"')
+
+    # optimize the empty case
+    if not response_dict:
+        return response_dict
+
+    # beginning of formats cleanup {{{
+    # drop urls that expire, or restrict IPs
+    def drop_format_url(**kwargs):
+        url = kwargs['url']
+        return (
+            url
+            and '://' in url
+            and (
+                '/ip/' in url
+                or '/expire/' in url
+            )
+        )
+
+    _drop_url_keys(response_dict, 'formats', drop_format_url)
+    _drop_url_keys(response_dict, 'requested_formats', drop_format_url)
+    # end of formats cleanup }}}
+
+    # beginning of automatic_captions cleanup {{{
+    # drop urls that expire, or restrict IPs
+    def drop_auto_caption_url(**kwargs):
+        url = kwargs['url']
+        return (
+            url
+            and '://' in url
+            and '&expire=' in url
+        )
+
+    _drop_url_keys(response_dict, 'automatic_captions', drop_auto_caption_url)
+    # end of automatic_captions cleanup }}}
+
+    return response_dict
+
+
 def parse_media_format(format_dict):
     '''
     This parser primarily adapts the format dict returned by youtube-dl into a

From 63fa97cc5842af7805c3efb1c8d58971b096893d Mon Sep 17 00:00:00 2001
From: tcely
Date: Tue, 7 Jan 2025 00:43:59 -0500
Subject: [PATCH 02/26] More compact JSON

The software doesn't need an extra space per key.
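For illustration, the effect of the tighter separators (a sketch; actual
savings depend on the metadata):

    >>> import json
    >>> json.dumps({'a': 1, 'b': [2, 3]})
    '{"a": 1, "b": [2, 3]}'
    >>> json.dumps({'a': 1, 'b': [2, 3]}, separators=(',', ':'))
    '{"a":1,"b":[2,3]}'

The defaults emit ', ' and ': ', so every key and every item separator
carries one extra space; large responses contain many of both.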
---
 tubesync/sync/tasks.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tubesync/sync/tasks.py b/tubesync/sync/tasks.py
index 3df651ba..080dff6d 100644
--- a/tubesync/sync/tasks.py
+++ b/tubesync/sync/tasks.py
@@ -304,7 +304,7 @@ def download_media_metadata(media_id):
         return
     source = media.source
     metadata = media.index_metadata()
-    media.metadata = json.dumps(metadata, default=json_serial)
+    media.metadata = json.dumps(metadata, separators=(',', ':'), default=json_serial)
     upload_date = media.upload_date
     # Media must have a valid upload date
     if upload_date:

From 8c31720bf707b0b12713af0e8a5a356f3bc6255d Mon Sep 17 00:00:00 2001
From: tcely
Date: Tue, 7 Jan 2025 01:33:06 -0500
Subject: [PATCH 03/26] Log the reduction of metadata length

---
 tubesync/sync/models.py | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/tubesync/sync/models.py b/tubesync/sync/models.py
index 2037492d..7ae68729 100644
--- a/tubesync/sync/models.py
+++ b/tubesync/sync/models.py
@@ -19,7 +19,7 @@ from common.utils import clean_filename, clean_emoji
 from .youtube import (get_media_info as get_youtube_media_info,
                       download_media as download_youtube_media,
                       get_channel_image_info as get_youtube_channel_image_info)
-from .utils import seconds_to_timestr, parse_media_format
+from .utils import seconds_to_timestr, parse_media_format, filter_response
 from .matching import (get_best_combined_format, get_best_audio_format,
                        get_best_video_format)
 from .mediaservers import PlexMediaServer
@@ -1143,12 +1143,27 @@ class Media(models.Model):
     def has_metadata(self):
         return self.metadata is not None
 
+
+    def reduce_data(self, data):
+        from common.logger import log
+        from common.utils import json_serial
+        # log the results of filtering / compacting on metadata size
+        filtered_data = filter_response(data)
+        compact_metadata = json.dumps(filtered_data, separators=(',', ':'), default=json_serial)
+        old_mdl = len(self.metadata)
+        new_mdl = len(compact_metadata)
+        if old_mdl > new_mdl:
+            delta = old_mdl - new_mdl
+            log.info(f'{self.key}: metadata reduced by {delta,} characters ({old_mdl,} -> {new_mdl,})')
+
+
     @property
     def loaded_metadata(self):
         try:
             data = json.loads(self.metadata)
             if not isinstance(data, dict):
                 return {}
+            self.reduce_data(data)
             return data
         except Exception as e:
             return {}

From 25d2ff680270aa9e4188233cba3770cd9dc5275e Mon Sep 17 00:00:00 2001
From: tcely
Date: Tue, 7 Jan 2025 02:12:22 -0500
Subject: [PATCH 04/26] Don't reduce the actual data yet

---
 tubesync/sync/models.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tubesync/sync/models.py b/tubesync/sync/models.py
index 7ae68729..44f24dfb 100644
--- a/tubesync/sync/models.py
+++ b/tubesync/sync/models.py
@@ -1163,7 +1163,7 @@ class Media(models.Model):
             data = json.loads(self.metadata)
             if not isinstance(data, dict):
                 return {}
-            self.reduce_data(data)
+            self.reduce_data(json.loads(self.metadata))
             return data
         except Exception as e:
             return {}

From 2f34fff7133754c05d348d50e43442a481c8adfc Mon Sep 17 00:00:00 2001
From: tcely
Date: Tue, 7 Jan 2025 02:55:05 -0500
Subject: [PATCH 05/26] Fixes from testing

The `automatic_captions` dict has a layer of language codes that I
didn't account for. The type checking was copied, and I didn't adjust it
for the arguments of this function.
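For reference, the nested shape that tripped up the first version looks
roughly like this in yt-dlp metadata (illustrative values only):

    response_dict = {
        'automatic_captions': {
            'en': [
                {'ext': 'vtt', 'url': 'https://example.invalid/caps?&expire=1736200000'},
            ],
            'de': [
                {'ext': 'vtt', 'url': 'https://example.invalid/caps?&expire=1736200000'},
            ],
        },
    }

The URL-bearing dicts sit one level deeper than in `formats`, so each
language code has to be walked separately.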
---
 tubesync/sync/utils.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/tubesync/sync/utils.py b/tubesync/sync/utils.py
index 162146eb..b85abaab 100644
--- a/tubesync/sync/utils.py
+++ b/tubesync/sync/utils.py
@@ -194,8 +194,8 @@ def filter_response(response_dict):
     Clean up the response so as to not store useless metadata in the database.
     '''
     # raise an exception for an unexpected argument type
-    if not isinstance(filedata, dict):
-        raise TypeError(f'filedata must be a dict, got "{type(filedata)}"')
+    if not isinstance(response_dict, dict):
+        raise TypeError(f'response_dict must be a dict, got "{type(response_dict)}"')
 
     # optimize the empty case
     if not response_dict:
         return response_dict
@@ -227,7 +227,11 @@ def filter_response(response_dict):
             and '&expire=' in url
         )
 
-    _drop_url_keys(response_dict, 'automatic_captions', drop_auto_caption_url)
+    ac_key = 'automatic_captions'
+    if ac_key in response_dict.keys():
+        ac_dict = response_dict[ac_key]
+        for lang_code in ac_dict:
+            _drop_url_keys(ac_dict, lang_code, drop_auto_caption_url)
     # end of automatic_captions cleanup }}}
 
     return response_dict

From 9a4101a0a147f3fe0ee91c13197a077f1f27cd3e Mon Sep 17 00:00:00 2001
From: tcely
Date: Tue, 7 Jan 2025 03:18:39 -0500
Subject: [PATCH 06/26] Fix formatting

---
 tubesync/sync/models.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tubesync/sync/models.py b/tubesync/sync/models.py
index 44f24dfb..077a8283 100644
--- a/tubesync/sync/models.py
+++ b/tubesync/sync/models.py
@@ -1154,7 +1154,7 @@ class Media(models.Model):
         new_mdl = len(compact_metadata)
         if old_mdl > new_mdl:
             delta = old_mdl - new_mdl
-            log.info(f'{self.key}: metadata reduced by {delta,} characters ({old_mdl,} -> {new_mdl,})')
+            log.info(f'{self.key}: metadata reduced by {delta:,} characters ({old_mdl:,} -> {new_mdl:,})')
 
 
     @property

From db25fa80294e035b1742fac2e044d2ff7de27464 Mon Sep 17 00:00:00 2001
From: tcely
Date: Tue, 7 Jan 2025 03:35:58 -0500
Subject: [PATCH 07/26] Adjusted comment

---
 tubesync/sync/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tubesync/sync/utils.py b/tubesync/sync/utils.py
index b85abaab..108cd757 100644
--- a/tubesync/sync/utils.py
+++ b/tubesync/sync/utils.py
@@ -218,7 +218,7 @@ def filter_response(response_dict):
     # end of formats cleanup }}}
 
     # beginning of automatic_captions cleanup {{{
-    # drop urls that expire, or restrict IPs
+    # drop urls that expire
     def drop_auto_caption_url(**kwargs):
         url = kwargs['url']
         return (

From 431de2e0dfa606d5a725a475159afe5fe370a251 Mon Sep 17 00:00:00 2001
From: tcely
Date: Tue, 7 Jan 2025 04:11:14 -0500
Subject: [PATCH 08/26] Loop over a set of keys for each URL type

---
 tubesync/sync/utils.py | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/tubesync/sync/utils.py b/tubesync/sync/utils.py
index 108cd757..f66348b4 100644
--- a/tubesync/sync/utils.py
+++ b/tubesync/sync/utils.py
@@ -213,13 +213,13 @@ def filter_response(response_dict):
             )
         )
 
-    _drop_url_keys(response_dict, 'formats', drop_format_url)
-    _drop_url_keys(response_dict, 'requested_formats', drop_format_url)
+    for key in frozenset(('formats', 'requested_formats',)):
+        _drop_url_keys(response_dict, key, drop_format_url)
     # end of formats cleanup }}}
 
-    # beginning of automatic_captions cleanup {{{
+    # beginning of subtitles cleanup {{{
     # drop urls that expire
-    def drop_auto_caption_url(**kwargs):
+    def drop_subtitles_url(**kwargs):
         url = kwargs['url']
         return (
             url
@@ -227,7 +227,8 @@ def filter_response(response_dict):
             and '&expire=' in url
         )
 
-    ac_key = 'automatic_captions'
-    if ac_key in response_dict.keys():
-        ac_dict = response_dict[ac_key]
-        for lang_code in ac_dict:
-            _drop_url_keys(ac_dict, lang_code, drop_auto_caption_url)
-    # end of automatic_captions cleanup }}}
+    # beginning of automatic_captions cleanup {{{
+    for key in frozenset(('subtitles', 'automatic_captions',)):
+        if key in response_dict.keys():
+            key_dict = response_dict[key]
+            for lang_code in key_dict:
+                _drop_url_keys(key_dict, lang_code, drop_subtitles_url)
+    # end of subtitles cleanup }}}
 
     return response_dict

From 7b8d11791d9725191146304f612ae7e2f7d3d0ec Mon Sep 17 00:00:00 2001
From: tcely
Date: Tue, 7 Jan 2025 05:39:50 -0500
Subject: [PATCH 09/26] Drop keys from formats that cannot be useful

---
 tubesync/sync/utils.py | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/tubesync/sync/utils.py b/tubesync/sync/utils.py
index f66348b4..8e98857e 100644
--- a/tubesync/sync/utils.py
+++ b/tubesync/sync/utils.py
@@ -213,8 +213,20 @@ def filter_response(response_dict):
             )
         )
 
+    # these format keys are not useful to us
+    drop_keys = frozenset((
+        'downloader_options',
+        'fragments',
+        'http_headers',
+        '__needs_testing',
+        '__working',
+    ))
     for key in frozenset(('formats', 'requested_formats',)):
         _drop_url_keys(response_dict, key, drop_format_url)
+        if key in response_dict.keys():
+            for format in response_dict[key]:
+                for drop_key in drop_keys:
+                    del format[drop_key]
     # end of formats cleanup }}}
 
     # beginning of subtitles cleanup {{{
@@ -227,7 +239,6 @@ def filter_response(response_dict):
             and '&expire=' in url
         )
 
-    # beginning of automatic_captions cleanup {{{
     for key in frozenset(('subtitles', 'automatic_captions',)):
         if key in response_dict.keys():
             key_dict = response_dict[key]
             for lang_code in key_dict:
                 _drop_url_keys(key_dict, lang_code, drop_subtitles_url)
     # end of subtitles cleanup }}}

From c7457e94ac1f27c04f912a086b9cc766f4ab5882 Mon Sep 17 00:00:00 2001
From: tcely
Date: Tue, 7 Jan 2025 05:58:50 -0500
Subject: [PATCH 10/26] Check that the drop_key exists

---
 tubesync/sync/utils.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tubesync/sync/utils.py b/tubesync/sync/utils.py
index 8e98857e..f73e243b 100644
--- a/tubesync/sync/utils.py
+++ b/tubesync/sync/utils.py
@@ -226,7 +226,8 @@ def filter_response(response_dict):
         if key in response_dict.keys():
             for format in response_dict[key]:
                 for drop_key in drop_keys:
-                    del format[drop_key]
+                    if drop_key in format.keys():
+                        del format[drop_key]
     # end of formats cleanup }}}

From 2d85bcbe14c0701782d5c76b0cb36116be193d08 Mon Sep 17 00:00:00 2001
From: tcely
Date: Tue, 7 Jan 2025 06:20:01 -0500
Subject: [PATCH 11/26] Use a distinct try to log errors

---
 tubesync/sync/models.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/tubesync/sync/models.py b/tubesync/sync/models.py
index 077a8283..6bcac984 100644
--- a/tubesync/sync/models.py
+++ b/tubesync/sync/models.py
@@ -1159,11 +1159,17 @@ class Media(models.Model):
 
     @property
     def loaded_metadata(self):
+        from common.logger import log
+        try:
+            self.reduce_data(json.loads(self.metadata))
+        except Exception as e:
+            log.error(f'reduce_data: {e.msg}')
+            pass
+
         try:
             data = json.loads(self.metadata)
             if not isinstance(data, dict):
                 return {}
-            self.reduce_data(json.loads(self.metadata))
             return data
         except Exception as e:
             return {}

From 8ac5b36eee9a504d0f0b5a9092c5120fa7f8ecbf Mon Sep 17 00:00:00 2001
From: tcely
Date: Tue, 7 Jan 2025 06:38:56 -0500
Subject: [PATCH 12/26] Use the exception function for traceback

---
 tubesync/sync/models.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tubesync/sync/models.py b/tubesync/sync/models.py
index 6bcac984..54fcdaa6 100644
--- a/tubesync/sync/models.py
+++ b/tubesync/sync/models.py
@@ -1163,7 +1163,7 @@ class Media(models.Model):
         try:
             self.reduce_data(json.loads(self.metadata))
         except Exception as e:
-            log.error(f'reduce_data: {e.msg}')
+            log.exception('reduce_data: %s', e)
             pass
 
         try:

From 779370122847bb24484181834a299f7e3f41ed1f Mon Sep 17 00:00:00 2001
From: tcely
Date: Tue, 7 Jan 2025 13:01:06 -0500
Subject: [PATCH 13/26] Simplify results from _url_keys

Also, name the tuple values when using the results.
---
 tubesync/sync/utils.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tubesync/sync/utils.py b/tubesync/sync/utils.py
index f73e243b..170b2a51 100644
--- a/tubesync/sync/utils.py
+++ b/tubesync/sync/utils.py
@@ -176,7 +176,7 @@ def _url_keys(arg_dict, filter_func):
     for key in arg_dict.keys():
         if 'url' in key:
             result.update(
-                {key: (key, filter_func(key=key, url=arg_dict[key]),)}
+                {key: (filter_func(key=key, url=arg_dict[key]),)}
             )
     return result
 
@@ -184,9 +184,9 @@ def _drop_url_keys(arg_dict, key, filter_func):
     if key in arg_dict.keys():
         for val_dict in arg_dict[key]:
-            for url_key in _url_keys(val_dict, filter_func):
-                if url_key[1] is True:
-                    del val_dict[url_key[0]]
+            for url_key, remove in _url_keys(val_dict, filter_func).items():
+                if remove is True:
+                    del val_dict[url_key]

From 1c432ccce127439bc722e4d0727d545794d51e4e Mon Sep 17 00:00:00 2001
From: tcely
Date: Tue, 7 Jan 2025 13:49:58 -0500
Subject: [PATCH 14/26] Some formats are using a different URL

---
 tubesync/sync/utils.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tubesync/sync/utils.py b/tubesync/sync/utils.py
index 170b2a51..14e7505f 100644
--- a/tubesync/sync/utils.py
+++ b/tubesync/sync/utils.py
@@ -176,7 +176,7 @@ def _url_keys(arg_dict, filter_func):
     for key in arg_dict.keys():
         if 'url' in key:
             result.update(
-                {key: (filter_func(key=key, url=arg_dict[key]),)}
+                {key: filter_func(key=key, url=arg_dict[key])}
             )
     return result
 
@@ -209,7 +209,9 @@ def filter_response(response_dict):
             and '://' in url
             and (
                 '/ip/' in url
+                or 'ip=' in url
                 or '/expire/' in url
+                or 'expire=' in url
             )
         )

From d35f52f8acb07c30f81c855a855b63d284dbaedf Mon Sep 17 00:00:00 2001
From: tcely
Date: Wed, 8 Jan 2025 11:31:23 -0500
Subject: [PATCH 15/26] Drop /expire/ URLs from automatic_captions too

---
 tubesync/sync/utils.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tubesync/sync/utils.py b/tubesync/sync/utils.py
index 14e7505f..b424528b 100644
--- a/tubesync/sync/utils.py
+++ b/tubesync/sync/utils.py
@@ -239,7 +239,10 @@ def filter_response(response_dict):
         return (
             url
             and '://' in url
-            and '&expire=' in url
+            and (
+                '/expire/' in url
+                or '&expire=' in url
+            )
         )

From ad10bcfa61af480fd9be9b3f7a97baeba18e033d Mon Sep 17 00:00:00 2001
From: tcely
Date: Wed, 8 Jan 2025 22:48:23 -0500
Subject: [PATCH 16/26] Log both compacted and reduced sizes

---
 tubesync/sync/models.py | 43 ++++++++++++++++++++++++-----------------
 1 file changed, 25 insertions(+), 18 deletions(-)

diff --git a/tubesync/sync/models.py b/tubesync/sync/models.py
index 54fcdaa6..76dea0b1 100644
--- a/tubesync/sync/models.py
+++ b/tubesync/sync/models.py
@@ -1144,28 +1144,35 @@ class Media(models.Model):
         return self.metadata is not None
 
 
-    def reduce_data(self, data):
-        from common.logger import log
-        from common.utils import json_serial
-        # log the results of filtering / compacting on metadata size
-        filtered_data = filter_response(data)
-        compact_metadata = json.dumps(filtered_data, separators=(',', ':'), default=json_serial)
-        old_mdl = len(self.metadata)
-        new_mdl = len(compact_metadata)
-        if old_mdl > new_mdl:
-            delta = old_mdl - new_mdl
-            log.info(f'{self.key}: metadata reduced by {delta:,} characters ({old_mdl:,} -> {new_mdl:,})')
+    @property
+    def reduce_data(self):
+        try:
+            from common.logger import log
+            from common.utils import json_serial
+
+            old_mdl = len(self.metadata or "")
+            data = json.loads(self.metadata or "")
+            compact_data = json.dumps(data, separators=(',', ':'), default=json_serial)
+
+            filtered_data = filter_response(data)
+            filtered_json = json.dumps(filtered_data, separators=(',', ':'), default=json_serial)
+        except Exception as e:
+            log.exception('reduce_data: %s', e)
+        else:
+            # log the results of filtering / compacting on metadata size
+            new_mdl = len(compact_data)
+            if old_mdl > new_mdl:
+                delta = old_mdl - new_mdl
+                log.info(f'{self.key}: metadata compacted by {delta:,} characters ({old_mdl:,} -> {new_mdl:,})')
+            new_mdl = len(filtered_json)
+            if old_mdl > new_mdl:
+                delta = old_mdl - new_mdl
+                log.info(f'{self.key}: metadata reduced by {delta:,} characters ({old_mdl:,} -> {new_mdl:,})')
 
 
     @property
     def loaded_metadata(self):
-        from common.logger import log
-        try:
-            self.reduce_data(json.loads(self.metadata))
-        except Exception as e:
-            log.exception('reduce_data: %s', e)
-            pass
-
+        self.reduce_data
         try:
             data = json.loads(self.metadata)
             if not isinstance(data, dict):
                 return {}

From 100382f66fea8b8dd27532932f23f4160d354401 Mon Sep 17 00:00:00 2001
From: tcely
Date: Thu, 9 Jan 2025 09:28:58 -0500
Subject: [PATCH 17/26] Rename compact_data to compact_json

This was misleading because the data dict becomes a JSON string.
---
 tubesync/sync/models.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tubesync/sync/models.py b/tubesync/sync/models.py
index 76dea0b1..67453f03 100644
--- a/tubesync/sync/models.py
+++ b/tubesync/sync/models.py
@@ -1152,7 +1152,7 @@ class Media(models.Model):
 
             old_mdl = len(self.metadata or "")
             data = json.loads(self.metadata or "")
-            compact_data = json.dumps(data, separators=(',', ':'), default=json_serial)
+            compact_json = json.dumps(data, separators=(',', ':'), default=json_serial)
 
             filtered_data = filter_response(data)
             filtered_json = json.dumps(filtered_data, separators=(',', ':'), default=json_serial)
@@ -1160,7 +1160,7 @@ class Media(models.Model):
             log.exception('reduce_data: %s', e)
         else:
             # log the results of filtering / compacting on metadata size
-            new_mdl = len(compact_data)
+            new_mdl = len(compact_json)
             if old_mdl > new_mdl:
                 delta = old_mdl - new_mdl
                 log.info(f'{self.key}: metadata compacted by {delta:,} characters ({old_mdl:,} -> {new_mdl:,})')

From 682a53da34d18d777e58e6080df4390f44519686 Mon Sep 17 00:00:00 2001
From: tcely
Date: Thu, 9 Jan 2025 10:17:37 -0500
Subject: [PATCH 18/26] Add a filter_response test

First, only check that changes did happen.
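To make the expected behaviour concrete, here is a minimal sketch of what
filter_response does at this point in the series (hypothetical input; the
real test uses a captured metadata fixture):

    >>> from sync.utils import filter_response
    >>> sample = {'formats': [
    ...     {'format_id': '22',
    ...      'url': 'https://cdn.example.invalid/expire/1736200000/video'},
    ... ]}
    >>> filter_response(sample)
    {'formats': [{'format_id': '22'}]}

The expiring URL is dropped while the rest of the format entry survives,
so the filtered copy is strictly smaller than the original.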
---
 tubesync/sync/tests.py | 38 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)

diff --git a/tubesync/sync/tests.py b/tubesync/sync/tests.py
index 8f0de6ef..935ad569 100644
--- a/tubesync/sync/tests.py
+++ b/tubesync/sync/tests.py
@@ -18,6 +18,7 @@ from background_task.models import Task
 from .models import Source, Media
 from .tasks import cleanup_old_media
 from .filtering import filter_media
+from .utils import filter_response
 
 
 class FrontEndTestCase(TestCase):
@@ -1709,6 +1710,43 @@ class FormatMatchingTestCase(TestCase):
                              f'expected {expected_match_result}')
 
 
+class ResponseFilteringTestCase(TestCase):
+
+    def setUp(self):
+        # Disable general logging for test case
+        logging.disable(logging.CRITICAL)
+        # Add a test source
+        self.source = Source.objects.create(
+            source_type=Source.SOURCE_TYPE_YOUTUBE_CHANNEL,
+            key='testkey',
+            name='testname',
+            directory='testdirectory',
+            index_schedule=3600,
+            delete_old_media=False,
+            days_to_keep=14,
+            source_resolution=Source.SOURCE_RESOLUTION_1080P,
+            source_vcodec=Source.SOURCE_VCODEC_VP9,
+            source_acodec=Source.SOURCE_ACODEC_OPUS,
+            prefer_60fps=False,
+            prefer_hdr=False,
+            fallback=Source.FALLBACK_FAIL
+        )
+        # Add some media
+        self.media = Media.objects.create(
+            key='mediakey',
+            source=self.source,
+            metadata='{}'
+        )
+
+    def test_metadata_20230629(self):
+        self.media.metadata = all_test_metadata['20230629']
+        self.media.save()
+
+        unfiltered = self.media.loaded_metadata
+        filtered = filter_response(self.media.loaded_metadata)
+        self.assertNotEqual(len(str(unfiltered)), len(str(filtered)))
+
+
 class TasksTestCase(TestCase):
 
     def setUp(self):

From 4c9fa40bb0e47871caffaf9a3212932727ffc1cb Mon Sep 17 00:00:00 2001
From: tcely
Date: Thu, 9 Jan 2025 11:47:10 -0500
Subject: [PATCH 19/26] More filter_response asserts

---
 tubesync/sync/tests.py | 36 +++++++++++++++++++++++++++++++++++-
 1 file changed, 35 insertions(+), 1 deletion(-)

diff --git a/tubesync/sync/tests.py b/tubesync/sync/tests.py
index 935ad569..bc199282 100644
--- a/tubesync/sync/tests.py
+++ b/tubesync/sync/tests.py
@@ -1744,7 +1744,41 @@ class ResponseFilteringTestCase(TestCase):
 
         unfiltered = self.media.loaded_metadata
         filtered = filter_response(self.media.loaded_metadata)
-        self.assertNotEqual(len(str(unfiltered)), len(str(filtered)))
+        self.assertIn('formats', unfiltered.keys())
+        self.assertIn('formats', filtered.keys())
+        # filtered 'http_headers'
+        self.assertIn('http_headers', unfiltered['formats'][0].keys())
+        self.assertNotIn('http_headers', filtered['formats'][0].keys())
+        # did not lose any formats
+        self.assertEqual(48, len(unfiltered['formats']))
+        self.assertEqual(48, len(filtered['formats']))
+        self.assertEqual(len(unfiltered['formats']), len(filtered['formats']))
+        # did reduce the size of the metadata
+        self.assertTrue(len(str(filtered)) < len(str(unfiltered)))
+
+        url_keys = []
+        for format in unfiltered['formats']:
+            for key in format.keys():
+                if 'url' in key:
+                    url_keys.append((format['format_id'], key, format[key],))
+        unfiltered_url_keys = url_keys
+        self.assertEqual(63, len(unfiltered_url_keys), msg=str(unfiltered_url_keys))
+
+        url_keys = []
+        for format in filtered['formats']:
+            for key in format.keys():
+                if 'url' in key:
+                    url_keys.append((format['format_id'], key, format[key],))
+        filtered_url_keys = url_keys
+        self.assertEqual(3, len(filtered_url_keys), msg=str(filtered_url_keys))
+
+        url_keys = []
+        for lang_code, captions in filtered['automatic_captions'].items():
+            for caption in captions:
+                for key in caption.keys():
+                    if 'url' in key:
+                        url_keys.append((lang_code, caption['ext'], caption[key],))
+        self.assertEqual(0, len(url_keys), msg=str(url_keys))
 
 
 class TasksTestCase(TestCase):

From 3e3f80d287c637c34f5c5094aa313531dfbe7b77 Mon Sep 17 00:00:00 2001
From: tcely
Date: Thu, 9 Jan 2025 12:04:01 -0500
Subject: [PATCH 20/26] More filter_response asserts

---
 tubesync/sync/tests.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/tubesync/sync/tests.py b/tubesync/sync/tests.py
index bc199282..2704058f 100644
--- a/tubesync/sync/tests.py
+++ b/tubesync/sync/tests.py
@@ -1746,6 +1746,9 @@ class ResponseFilteringTestCase(TestCase):
         filtered = filter_response(self.media.loaded_metadata)
         self.assertIn('formats', unfiltered.keys())
         self.assertIn('formats', filtered.keys())
+        # filtered 'downloader_options'
+        self.assertIn('downloader_options', unfiltered['formats'][10].keys())
+        self.assertNotIn('downloader_options', filtered['formats'][10].keys())
         # filtered 'http_headers'
         self.assertIn('http_headers', unfiltered['formats'][0].keys())
         self.assertNotIn('http_headers', filtered['formats'][0].keys())
@@ -1753,6 +1756,10 @@ class ResponseFilteringTestCase(TestCase):
         self.assertEqual(48, len(unfiltered['formats']))
         self.assertEqual(48, len(filtered['formats']))
         self.assertEqual(len(unfiltered['formats']), len(filtered['formats']))
+        # did not remove everything with url
+        self.assertIn('original_url', unfiltered.keys())
+        self.assertIn('original_url', filtered.keys())
+        self.assertEqual(unfiltered['original_url'], filtered['original_url'])
         # did reduce the size of the metadata
         self.assertTrue(len(str(filtered)) < len(str(unfiltered)))

From 29c39aab1f7096a7267c351cc3ebf0d786c98723 Mon Sep 17 00:00:00 2001
From: tcely
Date: Thu, 9 Jan 2025 13:20:22 -0500
Subject: [PATCH 21/26] Add SHRINK_NEW_MEDIA_METADATA setting

---
 tubesync/sync/tasks.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/tubesync/sync/tasks.py b/tubesync/sync/tasks.py
index 30f8c827..644918b7 100644
--- a/tubesync/sync/tasks.py
+++ b/tubesync/sync/tasks.py
@@ -8,6 +8,7 @@ import os
 import json
 import math
 import uuid
+from copy import deepcopy
 from io import BytesIO
 from hashlib import sha1
 from datetime import timedelta, datetime
@@ -26,7 +27,7 @@ from common.errors import NoMediaException, DownloadFailedException
 from common.utils import json_serial
 from .models import Source, Media, MediaServer
 from .utils import (get_remote_image, resize_image_to_height, delete_file,
-                    write_text_file)
+                    write_text_file, filter_response)
 from .filtering import filter_media
 
 
@@ -304,7 +305,11 @@ def download_media_metadata(media_id):
         return
     source = media.source
     metadata = media.index_metadata()
-    media.metadata = json.dumps(metadata, separators=(',', ':'), default=json_serial)
+    if getattr(settings, 'SHRINK_NEW_MEDIA_METADATA', False):
+        response = filter_response(deepcopy(metadata))
+    else:
+        response = metadata
+    media.metadata = json.dumps(response, separators=(',', ':'), default=json_serial)
     upload_date = media.upload_date
     # Media must have a valid upload date
     if upload_date:

From 0f986949e5ad18195de2265eae83f5360f6c5277 Mon Sep 17 00:00:00 2001
From: tcely
Date: Thu, 9 Jan 2025 13:36:43 -0500
Subject: [PATCH 22/26] Have filter_response return a copy, if requested

---
 tubesync/sync/utils.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/tubesync/sync/utils.py b/tubesync/sync/utils.py
index b424528b..1d67af38 100644
--- a/tubesync/sync/utils.py
+++ b/tubesync/sync/utils.py
@@ -1,6 +1,7 @@
 import os
 import re
 import math
+from copy import deepcopy
 from operator import itemgetter
 from pathlib import Path
 from tempfile import NamedTemporaryFile
@@ -189,13 +190,18 @@ def _drop_url_keys(arg_dict, key, filter_func):
                     del val_dict[url_key]
 
 
-def filter_response(response_dict):
+def filter_response(arg_dict, copy_arg=False):
     '''
     Clean up the response so as to not store useless metadata in the database.
     '''
+    response_dict = arg_dict
     # raise an exception for an unexpected argument type
     if not isinstance(response_dict, dict):
         raise TypeError(f'response_dict must be a dict, got "{type(response_dict)}"')
+
+    if copy_arg:
+        response_dict = deepcopy(arg_dict)
+
     # optimize the empty case
     if not response_dict:
         return response_dict

From 274f19fa15547c1a9d76c967e4134ffafa822aa1 Mon Sep 17 00:00:00 2001
From: tcely
Date: Thu, 9 Jan 2025 13:41:23 -0500
Subject: [PATCH 23/26] Use the new copy argument to filter_response

---
 tubesync/sync/tasks.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/tubesync/sync/tasks.py b/tubesync/sync/tasks.py
index 644918b7..ab92e2c8 100644
--- a/tubesync/sync/tasks.py
+++ b/tubesync/sync/tasks.py
@@ -8,7 +8,6 @@ import os
 import json
 import math
 import uuid
-from copy import deepcopy
 from io import BytesIO
 from hashlib import sha1
 from datetime import timedelta, datetime
@@ -305,10 +304,9 @@ def download_media_metadata(media_id):
         return
     source = media.source
     metadata = media.index_metadata()
+    response = metadata
     if getattr(settings, 'SHRINK_NEW_MEDIA_METADATA', False):
-        response = filter_response(deepcopy(metadata))
-    else:
-        response = metadata
+        response = filter_response(metadata, True)
     media.metadata = json.dumps(response, separators=(',', ':'), default=json_serial)
     upload_date = media.upload_date
     # Media must have a valid upload date

From 1ff8dfda9897dd8c409feba2649b5ce15f5f7e32 Mon Sep 17 00:00:00 2001
From: tcely
Date: Thu, 9 Jan 2025 13:53:12 -0500
Subject: [PATCH 24/26] Use the new copy argument to filter_response

---
 tubesync/sync/models.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tubesync/sync/models.py b/tubesync/sync/models.py
index 67453f03..10fbbdbd 100644
--- a/tubesync/sync/models.py
+++ b/tubesync/sync/models.py
@@ -1154,7 +1154,7 @@ class Media(models.Model):
             data = json.loads(self.metadata or "")
             compact_json = json.dumps(data, separators=(',', ':'), default=json_serial)
 
-            filtered_data = filter_response(data)
+            filtered_data = filter_response(data, True)
             filtered_json = json.dumps(filtered_data, separators=(',', ':'), default=json_serial)

From 6292a9a59dc5d05db79241b9bd2d58f51be3cc6a Mon Sep 17 00:00:00 2001
From: tcely
Date: Thu, 9 Jan 2025 14:22:37 -0500
Subject: [PATCH 25/26] Add SHRINK_OLD_MEDIA_METADATA setting

---
 tubesync/sync/models.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tubesync/sync/models.py b/tubesync/sync/models.py
index 10fbbdbd..bb850af3 100644
--- a/tubesync/sync/models.py
+++ b/tubesync/sync/models.py
@@ -1168,6 +1168,8 @@ class Media(models.Model):
             if old_mdl > new_mdl:
                 delta = old_mdl - new_mdl
                 log.info(f'{self.key}: metadata reduced by {delta:,} characters ({old_mdl:,} -> {new_mdl:,})')
+            if getattr(settings, 'SHRINK_OLD_MEDIA_METADATA', False):
+                self.metadata = filtered_json
 
 
     @property

From 45d7039188c746e9726562808caa7ed8bbc5f6ee Mon Sep 17 00:00:00 2001
From: tcely
Date: Tue, 14 Jan 2025 05:34:59 -0500
Subject: [PATCH 26/26] Only log the extra messages with the new setting

---
 tubesync/sync/models.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tubesync/sync/models.py b/tubesync/sync/models.py
index bb850af3..a65abdf8 100644
--- a/tubesync/sync/models.py
+++ b/tubesync/sync/models.py
@@ -1174,7 +1174,8 @@ class Media(models.Model):
 
     @property
     def loaded_metadata(self):
-        self.reduce_data
+        if getattr(settings, 'SHRINK_OLD_MEDIA_METADATA', False):
+            self.reduce_data
         try:
             data = json.loads(self.metadata)
             if not isinstance(data, dict):
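For anyone trying the series out, a minimal sketch of enabling both
behaviours (the setting names come from patches 21 and 25; the settings
module path is an assumption, adjust for your deployment):

    # local Django settings override (assumed location:
    # tubesync/tubesync/local_settings.py)
    SHRINK_NEW_MEDIA_METADATA = True   # filter metadata as it is first downloaded
    SHRINK_OLD_MEDIA_METADATA = True   # shrink stored metadata when it is next loaded

Both default to False via the getattr() guards above, so existing
installs keep their current behaviour until a setting is explicitly
turned on.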