tubesync/tubesync/sync/models/source.py

599 lines
20 KiB
Python

import os
import re
import uuid
from collections import deque as queue
from pathlib import Path
from django import db
from django.conf import settings
from django.core.exceptions import SuspiciousOperation
from django.core.validators import RegexValidator
from django.utils import timezone
from django.utils.text import slugify
from django.utils.translation import gettext_lazy as _
from ..choices import (Val,
SponsorBlock_Category, YouTube_SourceType, IndexSchedule,
CapChoices, Fallback, FileExtension, FilterSeconds,
SourceResolution, SourceResolutionInteger,
YouTube_VideoCodec, YouTube_AudioCodec,
)
from ..fields import CommaSepChoiceField
from ..youtube import (
get_media_info as get_youtube_media_info,
get_channel_image_info as get_youtube_channel_image_info,
)
from ._migrations import media_file_storage
from ._private import _srctype_dict
class Source(db.models.Model):
'''
A Source is a source of media. Currently, this is either a YouTube channel
or a YouTube playlist.
'''
# --- Post-processing toggles: embed extra data into the downloaded file ---
embed_metadata = db.models.BooleanField(
_('embed metadata'),
default=False,
help_text=_('Embed metadata from source into file'),
)
embed_thumbnail = db.models.BooleanField(
_('embed thumbnail'),
default=False,
help_text=_('Embed thumbnail into the file'),
)
# --- Class-level lookup tables keyed by source type ---
# Fontawesome icons used for the source on the front end
ICONS = _srctype_dict('<i class="fab fa-youtube"></i>')
# Format to use to display a URL for the source
# The templates are paired positionally with YouTube_SourceType.values, so
# their order has to follow the order of that enumeration's values.
URLS = dict(zip(
YouTube_SourceType.values,
(
'https://www.youtube.com/c/{key}',
'https://www.youtube.com/channel/{key}',
'https://www.youtube.com/playlist?list={key}',
),
))
# Format used to create indexable URLs
# Same positional pairing as URLS; the playlist template has no {type}
# placeholder, so the type argument is ignored for playlists.
INDEX_URLS = dict(zip(
YouTube_SourceType.values,
(
'https://www.youtube.com/c/{key}/{type}',
'https://www.youtube.com/channel/{key}/{type}',
'https://www.youtube.com/playlist?list={key}',
),
))
# Callback functions to get a list of media from the source
INDEXERS = _srctype_dict(get_youtube_media_info)
# Field names to find the media ID used as the key when storing media
KEY_FIELD = _srctype_dict('id')
# --- Identity and bookkeeping ---
uuid = db.models.UUIDField(
_('uuid'),
primary_key=True,
editable=False,
default=uuid.uuid4,
help_text=_('UUID of the source'),
)
created = db.models.DateTimeField(
_('created'),
auto_now_add=True,
db_index=True,
help_text=_('Date and time the source was created'),
)
last_crawl = db.models.DateTimeField(
_('last crawl'),
db_index=True,
null=True,
blank=True,
help_text=_('Date and time the source was last crawled'),
)
source_type = db.models.CharField(
_('source type'),
max_length=1,
db_index=True,
choices=YouTube_SourceType.choices,
default=YouTube_SourceType.CHANNEL,
help_text=_('Source type'),
)
# key, name and directory are each unique across all sources.
key = db.models.CharField(
_('key'),
max_length=100,
db_index=True,
unique=True,
help_text=_('Source key, such as exact YouTube channel name or playlist ID'),
)
name = db.models.CharField(
_('name'),
max_length=100,
db_index=True,
unique=True,
help_text=_('Friendly name for the source, used locally in TubeSync only'),
)
directory = db.models.CharField(
_('directory'),
max_length=100,
db_index=True,
unique=True,
help_text=_('Directory name to save the media into'),
)
# Format string for media file names; the available placeholders are the
# keys produced by example_media_format_dict.
media_format = db.models.CharField(
_('media format'),
max_length=200,
default=settings.MEDIA_FORMATSTR_DEFAULT,
help_text=_('File format to use for saving files, detailed options at bottom of page.'),
)
# --- Indexing schedule ---
target_schedule = db.models.DateTimeField(
_('target schedule'),
blank=True,
db_index=True,
default=timezone.now,
help_text=_('Date and time when the task to index the source should begin'),
)
# The stored value is used as a number of seconds by task_run_at_dt.
index_schedule = db.models.IntegerField(
_('index schedule'),
choices=IndexSchedule.choices,
db_index=True,
default=IndexSchedule.EVERY_24_HOURS,
help_text=_('Schedule of how often to index the source for new media'),
)
# --- What to index and download ---
download_media = db.models.BooleanField(
_('download media'),
default=True,
help_text=_('Download media from this source, if not selected the source will only be indexed'),
)
index_videos = db.models.BooleanField(
_('index videos'),
default=True,
help_text=_('Index video media from this source'),
)
index_streams = db.models.BooleanField(
_('index streams'),
default=False,
help_text=_('Index live stream media from this source'),
)
# The stored value is used as a number of seconds by download_cap_date.
download_cap = db.models.IntegerField(
_('download cap'),
choices=CapChoices.choices,
default=CapChoices.CAP_NOCAP,
help_text=_('Do not download media older than this capped date'),
)
# --- Retention ---
delete_old_media = db.models.BooleanField(
_('delete old media'),
default=False,
help_text=_('Delete old media after "days to keep" days?'),
)
days_to_keep = db.models.PositiveSmallIntegerField(
_('days to keep'),
default=14,
help_text=_(
'If "delete old media" is ticked, the number of days after which '
'to automatically delete media'
),
)
# --- Filtering ---
# filter_text is passed to re.search() by is_regex_match.
filter_text = db.models.CharField(
_('filter string'),
max_length=200,
default='',
blank=True,
help_text=_('Regex compatible filter string for video titles'),
)
filter_text_invert = db.models.BooleanField(
_('invert filter text matching'),
default=False,
help_text=_('Invert filter string regex match, skip any matching titles when selected'),
)
filter_seconds = db.models.PositiveIntegerField(
_('filter seconds'),
blank=True,
null=True,
help_text=_('Filter Media based on Min/Max duration. Leave blank or 0 to disable filtering'),
)
# NOTE(review): a BooleanField whose choices come from FilterSeconds;
# presumably the two boolean values stand for min and max - confirm
# against the FilterSeconds definition in ..choices.
filter_seconds_min = db.models.BooleanField(
_('filter seconds min/max'),
choices=FilterSeconds.choices,
default=Val(FilterSeconds.MIN),
help_text=_(
'When Filter Seconds is > 0, do we skip on minimum (video shorter than limit) or maximum (video '
'greater than maximum) video duration'
),
)
delete_removed_media = db.models.BooleanField(
_('delete removed media'),
default=False,
help_text=_('Delete media that is no longer on this playlist'),
)
delete_files_on_disk = db.models.BooleanField(
_('delete files on disk'),
default=False,
help_text=_('Delete files on disk when they are removed from TubeSync'),
)
# --- Desired media format ---
source_resolution = db.models.CharField(
_('source resolution'),
max_length=8,
db_index=True,
choices=SourceResolution.choices,
default=SourceResolution.VIDEO_1080P,
help_text=_('Source resolution, desired video resolution to download'),
)
source_vcodec = db.models.CharField(
_('source video codec'),
max_length=8,
db_index=True,
choices=YouTube_VideoCodec.choices,
default=YouTube_VideoCodec.VP9,
help_text=_('Source video codec, desired video encoding format to download (ignored if "resolution" is audio only)'),
)
source_acodec = db.models.CharField(
_('source audio codec'),
max_length=8,
db_index=True,
choices=YouTube_AudioCodec.choices,
default=YouTube_AudioCodec.OPUS,
help_text=_('Source audio codec, desired audio encoding format to download'),
)
prefer_60fps = db.models.BooleanField(
_('prefer 60fps'),
default=True,
help_text=_('Where possible, prefer 60fps media for this source'),
)
prefer_hdr = db.models.BooleanField(
_('prefer hdr'),
default=False,
help_text=_('Where possible, prefer HDR media for this source'),
)
fallback = db.models.CharField(
_('fallback'),
max_length=1,
db_index=True,
choices=Fallback.choices,
default=Fallback.NEXT_BEST_HD,
help_text=_('What do do when media in your source resolution and codecs is not available'),
)
# --- Extra files written next to the media ---
copy_channel_images = db.models.BooleanField(
_('copy channel images'),
default=False,
help_text=_('Copy channel banner and avatar. These may be detected and used by some media servers'),
)
copy_thumbnails = db.models.BooleanField(
_('copy thumbnails'),
default=False,
help_text=_('Copy thumbnails with the media, these may be detected and used by some media servers'),
)
write_nfo = db.models.BooleanField(
_('write nfo'),
default=False,
help_text=_('Write an NFO file in XML with the media info, these may be detected and used by some media servers'),
)
write_json = db.models.BooleanField(
_('write json'),
default=False,
help_text=_('Write a JSON file with the media info, these may be detected and used by some media servers'),
)
has_failed = db.models.BooleanField(
_('has failed'),
default=False,
help_text=_('Source has failed to index media'),
)
# --- Subtitles ---
write_subtitles = db.models.BooleanField(
_('write subtitles'),
default=False,
help_text=_('Download video subtitles'),
)
auto_subtitles = db.models.BooleanField(
_('accept auto-generated subs'),
default=False,
help_text=_('Accept auto-generated subtitles'),
)
# NOTE(review): the validator pattern is anchored at the start only. If
# RegexValidator matches with search semantics, a value with a valid prefix
# followed by other text (for example 'en,123') would be accepted - confirm,
# and consider anchoring the end of the pattern as well.
sub_langs = db.models.CharField(
_('subs langs'),
max_length=30,
default='en',
help_text=_('List of subtitles langs to download, comma-separated. Example: en,fr or all,-fr,-live_chat'),
validators=[
RegexValidator(
regex=r"^(\-?[\_\.a-zA-Z-]+(,|$))+",
message=_('Subtitle langs must be a comma-separated list of langs. example: en,fr or all,-fr,-live_chat'),
),
],
)
# --- SponsorBlock ---
enable_sponsorblock = db.models.BooleanField(
_('enable sponsorblock'),
default=True,
help_text=_('Use SponsorBlock?'),
)
# Stored through the project's CommaSepChoiceField; 'all' is both the
# default and the configured "all choices" value.
sponsorblock_categories = CommaSepChoiceField(
_('removed categories'),
max_length=128,
possible_choices=SponsorBlock_Category.choices,
all_choice='all',
allow_all=True,
all_label='(All Categories)',
default='all',
help_text=_('Select the SponsorBlock categories that you wish to be removed from downloaded videos.'),
)
def __str__(self):
    '''
        The friendly, locally chosen name is the string form of a source.
    '''
    return self.name
class Meta:
# Singular and plural display names for the model, wrapped in the lazy
# gettext alias `_` imported at the top of the file.
verbose_name = _('Source')
verbose_name_plural = _('Sources')
@property
def icon(self):
    '''
        Icon markup registered in ICONS for this source's type.
    '''
    markup = self.ICONS.get(self.source_type)
    return markup
@property
def slugname(self):
    '''
        Slug form of the source name, limited to 80 characters.
        Underscores become dashes and both '&' and '+' are spelled out as
        'and' before the name is passed to slugify().
    '''
    text = self.name
    for old, new in (('_', '-'), ('&', 'and'), ('+', 'and')):
        text = text.replace(old, new)
    slug = slugify(text)
    return slug[:80]
def deactivate(self):
    '''
        Switch off downloading and indexing for this source and persist
        only the fields that were changed.
    '''
    disabled_state = {
        'download_media': False,
        'index_streams': False,
        'index_videos': False,
        'index_schedule': IndexSchedule.NEVER,
    }
    for field_name, value in disabled_state.items():
        setattr(self, field_name, value)
    # Limit the write to the fields touched above.
    self.save(update_fields=set(disabled_state))
@property
def is_active(self):
    '''
        Truthy when the source has an index schedule and at least one of
        downloading, stream indexing or video indexing is enabled.
        The result is the value of the deciding operand, not necessarily
        a bool.
    '''
    schedule = self.index_schedule
    if not schedule:
        return schedule
    return self.download_media or self.index_streams or self.index_videos
@property
def is_audio(self):
    '''
        True when the selected resolution is the audio-only choice.
    '''
    audio_only = SourceResolution.AUDIO.value
    return self.source_resolution == audio_only
@property
def is_playlist(self):
    '''
        True when the source type is the playlist choice.
    '''
    playlist = YouTube_SourceType.PLAYLIST.value
    return self.source_type == playlist
@property
def is_video(self):
    '''
        The opposite of is_audio.
    '''
    if self.is_audio:
        return False
    return True
@property
def download_cap_date(self):
    '''
        The moment download_cap seconds before now, or False when the
        cap is not a positive number.
    '''
    cap_seconds = self.download_cap
    if not (cap_seconds > 0):
        return False
    return timezone.now() - timezone.timedelta(seconds=cap_seconds)
@property
def days_to_keep_date(self):
    '''
        The moment days_to_keep days before now, or False when the
        number of days is not positive.
    '''
    keep_days = self.days_to_keep
    if not (keep_days > 0):
        return False
    return timezone.now() - timezone.timedelta(days=keep_days)
@property
def task_run_at_dt(self):
    '''
        Work out when the next index task should run.

        The result is also stored on self.target_schedule, in memory only;
        nothing is saved here. index_schedule is treated as a number of
        seconds:

        * shorter than EVERY_24_HOURS: now plus the schedule plus a second
        * shorter than EVERY_7_DAYS: the next occurrence of the current
          target hour, starting from the hour after the current one
        * otherwise: an unexpired target is kept; an expired one moves to
          the next occurrence of the target's hour and weekday, counted
          from the start of the current hour

        Returns a datetime.
    '''
    now = timezone.now()
    # Start of the current hour.
    when = now.replace(minute=0, second=0, microsecond=0)

    def advance_hour(arg_dt, target_hour, /):
        # Move forward 0-23 hours until the hour equals target_hour.
        delta_hours = ((24 + target_hour) - arg_dt.hour) % 24
        return arg_dt + timezone.timedelta(hours=delta_hours)

    def advance_day(arg_dt, target_weekday, /):
        # Move forward 0-6 days until the weekday equals target_weekday.
        # weekday is a method on datetime and has to be called.
        delta_days = ((7 + target_weekday) - arg_dt.weekday()) % 7
        return arg_dt + timezone.timedelta(days=delta_days)

    if self.target_schedule is None:
        self.target_schedule = when

    if Val(IndexSchedule.EVERY_24_HOURS) > self.index_schedule:
        self.target_schedule = now + timezone.timedelta(
            seconds=1+self.index_schedule,
        )
    elif Val(IndexSchedule.EVERY_7_DAYS) > self.index_schedule:
        # Add an hour with a timedelta: replace(hour=hour + 1) raises
        # ValueError when the current hour is 23.
        self.target_schedule = advance_hour(
            when + timezone.timedelta(hours=1),
            self.target_schedule.hour,
        )

    if now < self.target_schedule:
        return self.target_schedule

    when = advance_hour(when, self.target_schedule.hour)
    when = advance_day(when, self.target_schedule.weekday())
    self.target_schedule = when
    return when
@property
def extension(self):
    '''
        The extension is also used by youtube-dl to set the output container. As
        it is possible to quite easily pick combinations of codecs and containers
        which are invalid (e.g. OPUS audio in an MP4 container) just set this for
        people. All video is set to mkv containers, audio-only is set to m4a or ogg
        depending on audio codec.

        Raises ValueError for an audio-only source whose audio codec is
        neither MP4A nor OPUS.
    '''
    if not self.is_audio:
        return Val(FileExtension.MKV)
    if self.source_acodec == Val(YouTube_AudioCodec.MP4A):
        return Val(FileExtension.M4A)
    if self.source_acodec == Val(YouTube_AudioCodec.OPUS):
        return Val(FileExtension.OGG)
    raise ValueError('Unable to choose audio extension, unknown acodec')
@classmethod
def create_url(cls, source_type, key):
    '''
        Fill the display URL template registered for source_type with key.
    '''
    template = cls.URLS.get(source_type)
    return template.format(key=key)
@classmethod
def create_index_url(cls, source_type, key, type):
    '''
        Fill the index URL template registered for source_type with key
        and type. Templates without a {type} placeholder ignore type.
    '''
    template = cls.INDEX_URLS.get(source_type)
    return template.format(key=key, type=type)
@property
def url(self):
    '''
        Display URL for this source, built from its type and key.
    '''
    model = self.__class__
    return model.create_url(self.source_type, self.key)
def get_index_url(self, type):
    '''
        Index URL for this source and the requested media type.
    '''
    model = self.__class__
    return model.create_index_url(self.source_type, self.key, type)
@property
def format_summary(self):
    '''
        One-line description of the wanted resolution and codecs, with
        60FPS and HDR markers appended for video sources that prefer them.
    '''
    video_codec = 'none' if self.is_audio else self.source_vcodec
    audio_codec = self.source_acodec
    markers = ''
    if self.is_video:
        if self.prefer_60fps:
            markers += ' 60FPS'
        if self.prefer_hdr:
            markers += ' HDR'
    summary = f'{self.source_resolution} (video:{video_codec}, audio:{audio_codec}){markers}'
    return summary.strip()
@property
def directory_path(self):
    '''
        Path of this source's directory below the media storage location.
    '''
    return Path(media_file_storage.location) / self.type_directory_path
@property
def type_directory_path(self):
    '''
        Relative path of the source directory. When the
        SOURCE_DOWNLOAD_DIRECTORY_PREFIX setting is on, the directory is
        placed below the audio or video download directory.
    '''
    if not settings.SOURCE_DOWNLOAD_DIRECTORY_PREFIX:
        return Path(self.directory)
    if self.is_audio:
        prefix = settings.DOWNLOAD_AUDIO_DIR
    else:
        prefix = settings.DOWNLOAD_VIDEO_DIR
    return Path(prefix) / self.directory
def make_directory(self):
    '''
        Create the source directory, and any missing parents, if it does
        not exist yet.
    '''
    target = self.directory_path
    return os.makedirs(target, exist_ok=True)
@property
def get_image_url(self):
    '''
        Channel image information for this source's URL.
        Raises SuspiciousOperation for playlist sources.
    '''
    if not self.is_playlist:
        return get_youtube_channel_image_info(self.url)
    raise SuspiciousOperation('This source is a playlist so it doesn\'t have thumbnail.')
def directory_exists(self):
    '''
        True when the source directory exists and is writable.
    '''
    if not os.path.isdir(self.directory_path):
        return False
    return os.access(self.directory_path, os.W_OK)
@property
def key_field(self):
    '''
        Name of the metadata field holding the media ID for this source
        type, or an empty string when none is registered.
    '''
    field_name = self.KEY_FIELD.get(self.source_type, '')
    return field_name
@property
def source_resolution_height(self):
    '''
        Value registered in SourceResolutionInteger for the selected
        resolution, or 0 when there is none.
    '''
    height = SourceResolutionInteger.get(self.source_resolution, 0)
    return height
@property
def can_fallback(self):
    '''
        True unless the fallback choice is FAIL.
    '''
    fail_choice = Val(Fallback.FAIL)
    return self.fallback != fail_choice
@property
def example_media_format_dict(self):
    '''
        Populates a dict with real-ish and some placeholder data for media name
        format strings. Used for example filenames and media_format validation.
    '''
    resolution = self.source_resolution
    vcodec = self.source_vcodec.lower() if self.source_vcodec else ''

    # Pieces of the 'format' value, in order; empty pieces are left out.
    candidates = (
        resolution,
        vcodec,
        self.source_acodec.lower() if self.source_acodec else '',
        '60fps' if self.prefer_60fps else '',
        'hdr' if self.prefer_hdr else '',
    )
    format_parts = [part for part in candidates if part]

    def with_resolution(value):
        # Several placeholders are only filled when a resolution is set.
        return value if resolution else ''

    today = timezone.now()
    example = dict()
    example['yyyymmdd'] = today.strftime('%Y%m%d')
    example['yyyy_mm_dd'] = today.strftime('%Y-%m-%d')
    example['yyyy'] = today.strftime('%Y')
    example['mm'] = today.strftime('%m')
    example['dd'] = today.strftime('%d')
    example['source'] = self.slugname
    example['source_full'] = self.name
    example['uploader'] = 'Some Channel Name'
    example['title'] = 'some-media-title-name'
    example['title_full'] = 'Some Media Title Name'
    example['key'] = 'SoMeUnIqUiD'
    example['format'] = '-'.join(format_parts)
    example['playlist_title'] = 'Some Playlist Title'
    example['video_order'] = '01'
    example['ext'] = self.extension
    example['resolution'] = with_resolution(resolution)
    example['height'] = with_resolution('720')
    example['width'] = with_resolution('1280')
    example['vcodec'] = vcodec
    example['acodec'] = self.source_acodec.lower()
    example['fps'] = with_resolution('24')
    example['hdr'] = with_resolution('hdr')
    return example
def get_example_media_format(self):
    '''
        Render media_format with the example placeholder values.
        Any failure while building or formatting gives an empty string.
    '''
    try:
        placeholders = self.example_media_format_dict
        rendered = self.media_format.format(**placeholders)
    except Exception:
        # Best effort: an unusable format string yields no example.
        rendered = ''
    return rendered
def is_regex_match(self, media_item_title):
    '''
        True when no filter text is set, or when the filter text, used as
        a regular expression, is found anywhere in media_item_title.
    '''
    pattern = self.filter_text
    if pattern:
        found = re.search(pattern, media_item_title)
        return found is not None
    return True
def get_index(self, type):
    '''
        Ask the indexer registered for this source type for the media
        entries of the given type (for example 'videos' or 'streams').

        When a download cap is in effect, its length in whole days is
        passed to the indexer as days; otherwise days is None.

        Returns the 'entries' value of the indexer response, or an empty
        list when the response is not a dict or has no 'entries' key.
        Raises Exception when no callable indexer is registered.
    '''
    indexer = self.INDEXERS.get(self.source_type, None)
    if not callable(indexer):
        # The message used to contain a stray 'f' before the quoted type.
        raise Exception(f'Source type "{self.source_type}" has no indexer')
    days = None
    if self.download_cap_date:
        days = timezone.timedelta(seconds=self.download_cap).days
    response = indexer(self.get_index_url(type=type), days=days)
    if not isinstance(response, dict):
        return list()
    entries = response.get('entries', list())
    return entries
def index_media(self):
    '''
        Index the media source returning a queue of media metadata as dicts.

        The queue is bounded by the MAX_ENTRIES_PROCESSING setting when
        that is set to a non-zero value, and unbounded otherwise.
    '''
    limit = getattr(settings, 'MAX_ENTRIES_PROCESSING', 0) or None
    entries = queue(list(), limit)
    if self.index_videos:
        videos = self.get_index('videos')
        entries.extend(reversed(videos))

    # Playlists do something different that I have yet to figure out
    if self.is_playlist or not self.index_streams:
        return entries

    streams = self.get_index('streams')
    if entries.maxlen is None or not entries:
        entries.extend(reversed(streams))
        return entries

    # share the queue between streams and videos
    free_slots = entries.maxlen - len(entries)
    allowed_streams = max(entries.maxlen // 2, free_slots)
    entries.extend(reversed(streams[: allowed_streams]))
    return entries