From ea8223d86b89e25dce656fb613d35112ab27e159 Mon Sep 17 00:00:00 2001
From: Tim <timwhite88@gmail.com>
Date: Thu, 11 Jul 2024 16:17:29 +0800
Subject: [PATCH] move filtering to own module. Add filtering for days_to_keep

---
 tubesync/sync/filtering.py | 98 ++++++++++++++++++++++++++++++++++++++
 tubesync/sync/signals.py   | 69 +--------------------------
 tubesync/sync/tests.py     | 14 +++---
 3 files changed, 107 insertions(+), 74 deletions(-)
 create mode 100644 tubesync/sync/filtering.py

diff --git a/tubesync/sync/filtering.py b/tubesync/sync/filtering.py
new file mode 100644
index 00000000..591b4e57
--- /dev/null
+++ b/tubesync/sync/filtering.py
@@ -0,0 +1,98 @@
+'''
+    All the logic for filtering media from channels to work out if we should skip downloading it or not
+'''
+
+from common.logger import log
+from .models import Source, Media, MediaServer
+from datetime import datetime, timedelta
+from django.utils import timezone
+
+# Run every skip filter against the media instance; returns True if instance.skip was changed, otherwise False
+def filter_media(instance: Media):
+    # Assume we aren't skipping it, if any of these conditions are true, we skip it
+    skip = False
+
+    # Check if it's published
+    if filter_published(instance):
+        skip = True
+
+    # Check if older than max_cap_age, skip
+    if filter_max_cap(instance):
+        skip = True
+
+    # Check if older than source_cutoff
+    if filter_source_cutoff(instance):
+        skip = True
+
+    # Check if we have filter_text and the title does not match it, skip
+    if filter_filter_text(instance):
+        skip = True
+
+    # Only write (and report a change) when the computed value differs from the stored one
+    if instance.skip != skip:
+        instance.skip = skip
+        log.warn(f'Media: {instance.source} / {instance} has changed skip setting to {skip}')
+        return True
+
+    return False
+
+
+def filter_published(instance: Media):
+    # Return True (skip) when the media has no published date set, False otherwise
+    if not instance.published:
+        log.warn(f'Media: {instance.source} / {instance} has no published date '
+                 f'set, marking to be skipped')
+        return True
+    return False
+
+
+# Return True if we are to skip downloading it because the title does not match the source's filter text; an empty filter never skips
+def filter_filter_text(instance: Media):
+    filter_text = instance.source.filter_text.strip()
+
+    if not filter_text:
+        return False
+
+    # The source reports a match for this title, so don't skip downloading this
+    if instance.source.is_regex_match(instance.title):
+        log.info(f'Media: {instance.source} / {instance} has a valid '
+                 f'title filter, marking to be unskipped')
+        return False
+
+    log.info(f'Media: {instance.source} / {instance} doesn\'t match '
+             f'title filter, marking to be skipped')
+
+    return True
+
+
+# Return True if the media was published on or before the source's download cap date
+def filter_max_cap(instance: Media):
+    max_cap_age = instance.source.download_cap_date
+    if not max_cap_age:
+        log.debug(f'Media: {instance.source} / {instance} has not max_cap_age '
+                  f'so not skipping based on max_cap_age')
+        return False
+    # No published datetime means nothing to compare (avoids a TypeError); filter_published covers that case
+    if isinstance(instance.published, datetime) and instance.published <= max_cap_age:
+        log.info(f'Media: {instance.source} / {instance} is too old for '
+                 f'the download cap date, marking to be skipped')
+        return True
+    return False
+
+
+# If the source has a cut-off, check the upload date is within the allowed delta
+def filter_source_cutoff(instance: Media):
+    if instance.source.delete_old_media and instance.source.days_to_keep > 0:
+        if not isinstance(instance.published, datetime):
+            # Media has no known published date or incomplete metadata
+            log.warn(f'Media: {instance.source} / {instance} has no published date, skipping')
+            return True
+
+        delta = timezone.now() - timedelta(days=instance.source.days_to_keep)
+        if instance.published < delta:
+            # Media was published before the cutoff date (now minus days_to_keep), skip it
+            log.warn(f'Media: {instance.source} / {instance} is older than '
+                     f'{instance.source.days_to_keep} days, skipping')
+            return True
+
+    return False
\ No newline at end of file
diff --git a/tubesync/sync/signals.py b/tubesync/sync/signals.py
index df9e7224..5d436517 100644
--- a/tubesync/sync/signals.py
+++ b/tubesync/sync/signals.py
@@ -13,6 +13,7 @@ from .tasks import (delete_task_by_source, delete_task_by_media, index_source_ta
                     map_task_to_instance, check_source_directory_exists,
                     download_media, rescan_media_server, download_source_images)
 from .utils import delete_file
+from .filtering import filter_media
 
 
 @receiver(pre_save, sender=Source)
@@ -110,7 +111,7 @@ def media_post_save(sender, instance, created, **kwargs):
     # Reset the skip flag if the download cap has changed if the media has not
     # already been downloaded
     if not instance.downloaded and instance.metadata:
-        skip_changed = filter_instance(instance)
+        skip_changed = filter_media(instance)
 
     # Recalculate the "can_download" flag, this may
     # need to change if the source specifications have been changed
@@ -172,73 +173,7 @@ def media_post_save(sender, instance, created, **kwargs):
         )
 
 
-# Check the filter conditions for instance, return is if the Skip property has changed so we can do other things
-def filter_instance(instance):
-    # Assume we aren't skipping it, if any of these conditions are true, we skip it
-    skip = False
 
-    # Check if it's published
-    if filter_instance_published(instance):
-        skip = True
-
-    # Check if older than max_cap_age, skip
-    if filter_instance_max_cap(instance):
-        skip = True
-
-    # Check if we have filter_text and filter text matches, set unskip
-    if filter_instance_filter_text(instance):
-        skip = True
-
-    # Check if skipping
-    if instance.skip != skip:
-        instance.skip = skip
-        log.warn(f'Media: {instance.source} / {instance} has changed skip setting to {skip}')
-        return True
-
-    return False
-
-
-def filter_instance_published(instance):
-    # Check if the instance is not published, we have to skip then
-    if not instance.published:
-        log.warn(f'Media: {instance.source} / {instance} has no published date '
-                 f'set, marking to be skipped')
-        return True
-    return False
-
-
-# Return True if we are to skip downloading it based on filter text not matching
-def filter_instance_filter_text(instance):
-    filter_text = instance.source.filter_text.strip()
-
-    if not filter_text:
-        return False
-
-    # We match the filter text, so don't skip downloading this
-    if instance.source.is_regex_match(instance.title):
-        log.info(f'Media: {instance.source} / {instance} has a valid '
-                 f'title filter, marking to be unskipped')
-        return False
-
-    log.info(f'Media: {instance.source} / {instance} doesn\'t match '
-             f'title filter, marking to be skipped')
-
-    return True
-
-
-def filter_instance_max_cap(instance):
-    max_cap_age = instance.source.download_cap_date
-    if not max_cap_age:
-        log.debug(f'Media: {instance.source} / {instance} has not max_cap_age '
-                  f'so not skipping based on max_cap_age')
-        return False
-
-    if instance.published <= max_cap_age:
-        log.info(f'Media: {instance.source} / {instance} is too old for '
-                 f'the download cap date, marking to be skipped')
-        return True
-
-    return False
 
 @receiver(pre_delete, sender=Media)
 def media_pre_delete(sender, instance, **kwargs):
diff --git a/tubesync/sync/tests.py b/tubesync/sync/tests.py
index e1ea7494..e9d8d986 100644
--- a/tubesync/sync/tests.py
+++ b/tubesync/sync/tests.py
@@ -15,7 +15,7 @@ from django.utils import timezone
 from background_task.models import Task
 from .models import Source, Media
 from .tasks import cleanup_old_media
-from .signals import filter_instance
+from .filtering import filter_media
 
 
 class FrontEndTestCase(TestCase):
@@ -735,7 +735,7 @@ class MediaFilterTestCase(TestCase):
         # Check if unpublished that we skip download it
         self.media.skip = False
         self.media.published = False
-        changed = filter_instance(self.media)
+        changed = filter_media(self.media)
         self.assertTrue(changed)
         self.assertTrue(self.media.skip)
 
@@ -744,7 +744,7 @@ class MediaFilterTestCase(TestCase):
         self.media.skip = True
         self.media.published = timezone.make_aware(datetime(year=2020, month=1, day=1, hour=1,
                                         minute=1, second=1))
-        changed = filter_instance(self.media)
+        changed = filter_media(self.media)
         self.assertTrue(changed)
         self.assertFalse(self.media.skip)
 
@@ -754,7 +754,7 @@ class MediaFilterTestCase(TestCase):
         self.media.skip = False
         self.media.published = timezone.make_aware(datetime(year=2020, month=1, day=1, hour=1,
                                         minute=1, second=1))
-        changed = filter_instance(self.media)
+        changed = filter_media(self.media)
         self.assertTrue(changed)
         self.assertTrue(self.media.skip)
 
@@ -764,7 +764,7 @@ class MediaFilterTestCase(TestCase):
         self.media.skip = True
         self.media.published = timezone.make_aware(datetime(year=2020, month=1, day=1, hour=1,
                                         minute=1, second=1))
-        changed = filter_instance(self.media)
+        changed = filter_media(self.media)
         self.assertTrue(changed)
         self.assertFalse(self.media.skip)
 
@@ -774,7 +774,7 @@ class MediaFilterTestCase(TestCase):
         self.media.skip = False
         self.media.published = timezone.make_aware(datetime(year=2020, month=1, day=1, hour=1,
                                         minute=1, second=1))
-        changed = filter_instance(self.media)
+        changed = filter_media(self.media)
         self.assertTrue(changed)
         self.assertTrue(self.media.skip)
 
@@ -783,7 +783,7 @@ class MediaFilterTestCase(TestCase):
         self.media.source.download_cap = 3600
         self.media.skip = True
         self.media.published = timezone.now()
-        changed = filter_instance(self.media)
+        changed = filter_media(self.media)
         self.assertTrue(changed)
         self.assertFalse(self.media.skip)