tubesync/tubesync/sync/filtering.py

200 lines
6.5 KiB
Python

"""
All the logic for filtering media from channels to work out if we should skip downloading it or not
"""
from common.logger import log
from .models import Media
from datetime import datetime
from django.utils import timezone
from .overrides.custom_filter import filter_custom
# Check the filter conditions for the instance; returns whether the skip property has changed so the caller can act on it
def filter_media(instance: Media):
    """
    Re-evaluate every skip filter for a media item and update ``instance.skip``.

    The published, download cap and source cut-off checks are always
    evaluated. The title, duration and custom filters are evaluated in that
    order, and only while no earlier check has asked for the item to be
    skipped.

    An item that is currently skipped only has its flag cleared when no
    filter asks for a skip and the item is published, kept by the source
    cut-off, not downloaded and not older than the download cap.

    ``instance.skip`` is assigned in place; this function does not itself
    save the instance.

    Returns True if ``instance.skip`` ends up different from its previous
    value, otherwise False.
    """
    # These three checks always run, in this order
    published = not filter_published(instance)
    too_old = published and filter_max_cap(instance)
    kept = not filter_source_cutoff(instance)
    should_skip = (not published) or too_old or (not kept)

    # Content based filters only run while nothing has asked for a skip yet,
    # and each one stops the following ones from running once it matches
    content_filtered = False
    if not should_skip:
        if filter_filter_text(instance) or filter_duration(instance):
            content_filtered = True
        elif filter_custom(instance):
            log.info(f"Media: {instance.source} / {instance} has been skipped by Custom Filter")
            content_filtered = True
    should_skip = should_skip or content_filtered

    # Clearing an existing skip flag is only allowed for a published,
    # not downloaded item that passed both date based checks
    newly_published = published and kept and not (instance.downloaded or too_old)
    may_unskip = newly_published and not content_filtered

    previous = instance.skip
    if previous == should_skip:
        return False

    instance.skip = should_skip
    if previous and not (may_unskip or should_skip):
        # It was skipped and nothing permits clearing the flag, keep it skipped
        instance.skip = True

    if instance.skip == previous:
        return False
    log.info(
        f"Media: {instance.source} / {instance} has changed skip setting to {instance.skip}"
    )
    return True
def filter_published(instance: Media):
    """
    Report whether the media item must be skipped for lacking a published date.

    Returns True (skip), after logging the decision, when
    ``instance.published`` is not a ``datetime``; returns False otherwise.
    """
    has_published_date = isinstance(instance.published, datetime)
    if has_published_date:
        return False
    # Without a published date we have to skip the item
    log.info(
        f"Media: {instance.source} / {instance} has no published date "
        f"set, marking to be skipped"
    )
    return True
# Return True if we are to skip downloading it: the title does not match the filter text, or it does match while the filter is inverted
def filter_filter_text(instance: Media):
    """
    Decide whether the source's title filter asks for this item to be skipped.

    Returns False when the source's ``filter_text`` is empty after stripping
    whitespace. Otherwise the title is tested with
    ``instance.source.is_regex_match``: in normal mode a non-matching title
    is skipped (True), and when ``filter_text_invert`` is set a matching
    title is skipped (True). Every decision is logged.
    """
    source = instance.source
    if not source.filter_text.strip():
        # No filter text configured, nothing to filter on
        return False

    inverted = source.filter_text_invert
    title_matches = source.is_regex_match(instance.title)

    if inverted:
        if title_matches:
            log.info(
                f"Media: {instance.source} / {instance} matches inverted "
                f"title filter, marking to be skipped"
            )
            return True
        log.info(
            f"Media: {instance.source} / {instance} does not match the inverted "
            f"title filter, not marking to be skipped"
        )
        return False

    if title_matches:
        # We match the filter text, so don't skip downloading this
        log.info(
            f"Media: {instance.source} / {instance} has a valid "
            f"title filter, not marking to be skipped"
        )
        return False
    log.info(
        f"Media: {instance.source} / {instance} doesn't match "
        f"title filter, marking to be skipped"
    )
    return True
def filter_max_cap(instance: Media):
    """
    Report whether the media item is too old for the source's download cap.

    Returns True (skip) when the source has a ``download_cap_date`` and
    ``instance.published`` is on or before it. Returns False when there is
    no usable published date, no cap date, or the item is newer than the cap.
    """
    # Guard with the same isinstance test that filter_published and
    # filter_source_cutoff use for their dates, so that a value which is
    # neither None nor a datetime never reaches the comparison below
    if not isinstance(instance.published, datetime):
        log.debug(
            f"Media: {instance.source} / {instance} has no published date "
            f"set (likely not downloaded metadata) so not filtering based on "
            f"publish date"
        )
        return False

    max_cap_age = instance.source.download_cap_date
    if not max_cap_age or instance.published > max_cap_age:
        return False

    # log new media instances, not every media instance every time
    if not instance.skip:
        log.info(
            f"Media: {instance.source} / {instance} is too old for "
            f"the download cap date, marking to be skipped"
        )
    return True
# If the source has a cut-off, check the download date is within the allowed delta
def filter_source_cutoff(instance: Media):
    """
    Report whether a downloaded item has passed the source's keep cut-off.

    Only applies when the source has both ``delete_old_media`` and
    ``days_to_keep_date`` set, and the item is downloaded with a
    ``datetime`` download date. Returns True (skip), after logging, when
    ``instance.download_date`` is earlier than the source's
    ``days_to_keep_date``; returns False in every other case.
    """
    source = instance.source
    if not (source.delete_old_media and source.days_to_keep_date):
        # The source has no cut-off configured
        return False
    if not (instance.downloaded and isinstance(instance.download_date, datetime)):
        # Nothing downloaded yet, or no usable download date to compare
        return False

    cutoff = source.days_to_keep_date
    if instance.download_date >= cutoff:
        return False

    # Media has expired, skip it
    log.info(
        f"Media: {instance.source} / {instance} is older than "
        f"{instance.source.days_to_keep} days, skipping"
    )
    return True
# Check if we skip based on duration (min/max)
def filter_duration(instance: Media):
    """
    Decide whether the source's duration limit asks for this item to be skipped.

    Returns False when the source has no ``filter_seconds`` set, or when no
    duration is available for the item. When ``instance.duration`` is empty
    the value of ``instance.metadata_duration`` is used instead, stored on
    ``instance.duration`` and persisted with ``instance.save()``.

    The source's ``filter_seconds_min`` selects how ``filter_seconds`` is
    applied: when truthy it is a minimum and shorter items are skipped,
    otherwise it is a maximum and longer items are skipped. Returns True
    (skip), after logging, when the limit is violated.
    """
    duration_limit = instance.source.filter_seconds
    if not duration_limit:
        return False

    duration = instance.duration
    if not duration:
        # Attempt fallback to slower metadata field, this adds significant time, new media won't need this
        # Read it only once, because the lookup is the expensive part
        fallback_duration = instance.metadata_duration
        if not fallback_duration:
            log.info(
                f"Media: {instance.source} / {instance} has no duration stored, not skipping"
            )
            return False
        duration = fallback_duration
        # Store the value so the slow fallback is not needed next time
        instance.duration = duration
        instance.save()

    limit_is_minimum = instance.source.filter_seconds_min
    if limit_is_minimum and duration < duration_limit:
        # Filter out videos that are shorter than the minimum
        log.info(
            f"Media: {instance.source} / {instance} is shorter ({duration}) than "
            f"the minimum duration ({duration_limit}), skipping"
        )
        return True

    if not limit_is_minimum and duration > duration_limit:
        # Filter out videos that are greater than the maximum
        log.info(
            f"Media: {instance.source} / {instance} is longer ({duration}) than "
            f"the maximum duration ({duration_limit}), skipping"
        )
        return True

    return False