Write to the database in batched transactions

tcely authored 2025-05-28 05:16:37 -04:00 · committed by GitHub
parent 811e36abe0
commit 2e1b96bb61


@@ -10,6 +10,7 @@ import random
 import requests
 import time
 import uuid
+from collections import deque as queue
 from io import BytesIO
 from hashlib import sha1
 from pathlib import Path
@@ -33,7 +34,7 @@ from common.errors import ( NoFormatException, NoMediaException,
 from common.utils import ( django_queryset_generator as qs_gen,
                            remove_enclosed, )
 from .choices import Val, TaskQueue
-from .models import Source, Media, MediaServer
+from .models import Source, Media, MediaServer, Metadata
 from .utils import ( get_remote_image, resize_image_to_height,
                      write_text_file, filter_response, seconds_to_timestr, )
 from .youtube import YouTubeError
@@ -302,6 +303,24 @@ def cleanup_removed_media(source, video_keys):
     schedule_media_servers_update()


+def save_db_batch(qs, objs, fields, /):
+    assert hasattr(qs, 'bulk_update')
+    assert callable(qs.bulk_update)
+    assert hasattr(objs, '__len__')
+    assert callable(objs.__len__)
+    assert isinstance(fields, (tuple, list, set, frozenset))
+
+    num_updated = 0
+    num_objs = len(objs)
+    with atomic(durable=False):
+        num_updated = qs.bulk_update(objs=objs, fields=fields)
+    if num_objs == num_updated:
+        # this covers at least: list, set, deque
+        if hasattr(objs, 'clear') and callable(objs.clear):
+            objs.clear()
+    return num_updated
+
+
 @background(schedule=dict(priority=20, run_at=30), queue=Val(TaskQueue.NET), remove_existing_tasks=True)
 def index_source_task(source_id):
     '''
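Note on the helper above: `QuerySet.bulk_update()` writes all the given objects back in a single UPDATE per batch and, since Django 4.0, returns the number of rows it matched, which `save_db_batch()` compares against `len(objs)` before clearing the container. `atomic` is assumed here to be `django.db.transaction.atomic`, imported elsewhere in this module. A minimal standalone sketch of the intended call pattern (names below are illustrative, not part of the commit):

    from collections import deque

    batch = deque(maxlen=10)
    fields = frozenset(('title',))
    for media in Media.objects.iterator():
        if len(batch) == batch.maxlen:
            # deque is full: write this batch in one transaction; the
            # helper clears the deque so it can be refilled safely
            save_db_batch(Media.objects, batch, fields)
        media.title = media.title.strip()
        batch.append(media)
    # flush whatever is left after the loop
    save_db_batch(Media.objects, batch, fields)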
@@ -347,6 +366,17 @@ def index_source_task(source_id):
     tvn_format = '{:,}' + f'/{num_videos:,}'
     vn = 0
     video_keys = set()
+    db_batch_data = queue(list(), maxlen=50)
+    db_fields_data = frozenset((
+        'retrieved',
+        'value',
+    ))
+    db_batch_media = queue(list(), maxlen=10)
+    db_fields_media = frozenset((
+        'duration',
+        'published',
+        'title',
+    ))
     while len(videos) > 0:
         vn += 1
         video = videos.popleft()
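The batch containers are bounded deques (`queue` is aliased to `collections.deque` in the import hunk above), so at most 50 dirty metadata rows and 10 dirty media rows are ever held in memory. One subtlety explains why the loop below tests `len(...) == maxlen` *before* appending: a full `deque` does not raise on `append()`, it silently discards an item from the opposite end. A quick illustration (not from the commit):

    from collections import deque

    d = deque([1, 2, 3], maxlen=3)
    d.append(4)
    print(d)  # deque([2, 3, 4], maxlen=3) -- 1 was dropped silently

    # Flushing first guarantees nothing is ever evicted unsaved:
    # if len(batch) == batch.maxlen:
    #     save_db_batch(Model.objects, batch, fields)  # empties batch
    # batch.append(obj)                                # guaranteed to fit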
@@ -355,14 +385,24 @@ def index_source_task(source_id):
         if not key:
             # Video has no unique key (ID), it can't be indexed
             continue
+        if len(db_batch_data) == db_batch_data.maxlen:
+            save_db_batch(Metadata.objects, db_batch_data, db_fields_data)
+        if len(db_batch_media) == db_batch_media.maxlen:
+            save_db_batch(Media.objects, db_batch_media, db_fields_media)
         video_keys.add(key)
         update_task_status(task, tvn_format.format(vn))
-        # media, new_media = Media.objects.get_or_create(key=key, source=source)
-        try:
-            media = Media.objects.get(key=key, source=source)
-        except Media.DoesNotExist:
-            media = Media(key=key)
-            media.source = source
+        data, new_data = source.videos.defer('value').filter(
+            media__isnull=True,
+        ).get_or_create(key=key)
+        data.retrieved = source.last_crawl
+        data.value = video
+        db_batch_data.append(data)
+        media, new_media = source.media_source.only(
+            'uuid',
+            'source',
+            'key',
+            *db_fields_media,
+        ).get_or_create(key=key)
         media.duration = float(video.get(fields('duration', media), None) or 0) or None
         media.title = str(video.get(fields('title', media), ''))[:200]
         timestamp = video.get(fields('timestamp', media), None)
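Both new lookups narrow the columns fetched per row, which matters when thousands of videos are indexed in one pass: `defer('value')` keeps the potentially large metadata blob out of the SELECT, and `only('uuid', 'source', 'key', *db_fields_media)` restricts the Media query to the key fields plus exactly the columns the batch will rewrite. Assigning to a deferred field does not trigger an extra query; only reading one does, and `bulk_update()` simply writes the assigned value. A rough sketch of the shape of the Metadata lookup (the key and dict values are hypothetical):

    data, created = (
        source.videos                      # reverse relation to Metadata rows
              .defer('value')              # don't load the JSON blob on SELECT
              .filter(media__isnull=True)  # only rows not yet tied to a Media
              .get_or_create(key='abc123')
    )
    data.retrieved = source.last_crawl
    data.value = {'title': 'example'}      # deferred field: assign, don't read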
@@ -373,45 +413,47 @@ def index_source_task(source_id):
         else:
             if published_dt:
                 media.published = published_dt
-        try:
-            media.save()
-        except IntegrityError as e:
-            log.error(f'Index media failed: {source} / {media} with "{e}"')
-        else:
+        db_batch_media.append(media)
+        if not new_media:
             log.debug(f'Indexed media: {vn}: {source} / {media}')
+        else:
             # log the new media instances
-            new_media_instance = (
-                # new_media or
-                media.created and
-                source.last_crawl and
-                media.created >= source.last_crawl
-            )
-            if new_media_instance:
-                log.info(f'Indexed new media: {source} / {media}')
-                log.info(f'Scheduling tasks to download thumbnail for: {media.key}')
-                thumbnail_fmt = 'https://i.ytimg.com/vi/{}/{}default.jpg'
-                vn_fmt = _('Downloading {} thumbnail for: "{}": {}')
-                for prefix in ('hq', 'sd', 'maxres',):
-                    thumbnail_url = thumbnail_fmt.format(
-                        media.key,
-                        prefix,
-                    )
-                    download_media_thumbnail(
-                        str(media.pk),
-                        thumbnail_url,
-                        verbose_name=vn_fmt.format(prefix, media.key, media.name),
-                    )
-                log.info(f'Scheduling task to download metadata for: {media.url}')
-                verbose_name = _('Downloading metadata for: "{}": {}')
-                download_media_metadata(
-                    str(media.pk),
-                    verbose_name=verbose_name.format(media.key, media.name),
-                )
+            log.info(f'Indexed new media: {source} / {media}')
+            log.info(f'Scheduling tasks to download thumbnail for: {media.key}')
+            thumbnail_fmt = 'https://i.ytimg.com/vi/{}/{}default.jpg'
+            vn_fmt = _('Downloading {} thumbnail for: "{}": {}')
+            for prefix in ('hq', 'sd', 'maxres',):
+                thumbnail_url = thumbnail_fmt.format(
+                    media.key,
+                    prefix,
+                )
+                download_media_thumbnail(
+                    str(media.pk),
+                    thumbnail_url,
+                    verbose_name=vn_fmt.format(prefix, media.key, media.name),
+                )
+            log.info(f'Scheduling task to download metadata for: {media.url}')
+            verbose_name = _('Downloading metadata for: "{}": {}')
+            download_media_metadata(
+                str(media.pk),
+                verbose_name=verbose_name.format(media.key, media.name),
+            )
     # Reset task.verbose_name to the saved value
     update_task_status(task, None)
+    # Update any remaining items in the batches
+    save_db_batch(Metadata.objects, db_batch_data, db_fields_data)
+    save_db_batch(Media.objects, db_batch_media, db_fields_media)
     # Cleanup of media no longer available from the source
     cleanup_removed_media(source, video_keys)
     videos = video = None
+    db_batch_data.clear()
+    db_batch_media.clear()
+    # Trigger any signals that we skipped with batches
+    vn_fmt = _('Checking all media for "{}"')
+    save_all_media_for_source(
+        str(source.pk),
+        verbose_name=vn_fmt.format(source.name),
+    )


 @background(schedule=dict(priority=0, run_at=0), queue=Val(TaskQueue.FS))
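Why the trailing `save_all_media_for_source` task matters: `bulk_update()` issues UPDATE statements directly and never calls `Model.save()`, so `pre_save`/`post_save` receivers are skipped for every batched row — those are the "signals that we skipped with batches" the comment refers to, and the scheduled task replays that per-object processing afterwards. A minimal demonstration of the gap (illustrative only, assuming a registered receiver):

    from django.db.models.signals import post_save
    from django.dispatch import receiver

    @receiver(post_save, sender=Media)
    def media_saved(sender, instance, **kwargs):
        print('post_save fired for', instance.pk)

    m = Media.objects.first()
    m.title = 'renamed'
    m.save()                                   # receiver runs
    Media.objects.bulk_update([m], ['title'])  # receiver does NOT run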