mirror of
				https://github.com/yt-dlp/yt-dlp.git
				synced 2025-11-04 00:20:47 +00:00 
			
		
		
		
	[cleanup] Misc fixes
Cherry-picks from: #3498, #3947 Related: #3949, https://github.com/yt-dlp/yt-dlp/issues/1839#issuecomment-1140313836 Authored by: pukkandan, flashdagger, gamer191
This commit is contained in:
		@@ -786,7 +786,8 @@ class InfoExtractor:
 | 
			
		||||
                self.report_warning(errmsg)
 | 
			
		||||
                return False
 | 
			
		||||
 | 
			
		||||
    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
 | 
			
		||||
    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True,
 | 
			
		||||
                                 encoding=None, data=None, headers={}, query={}, expected_status=None):
 | 
			
		||||
        """
 | 
			
		||||
        Return a tuple (page content as string, URL handle).
 | 
			
		||||
 | 
			
		||||
@@ -943,7 +944,7 @@ class InfoExtractor:
 | 
			
		||||
                except ValueError:
 | 
			
		||||
                    raise e
 | 
			
		||||
        except ValueError as ve:
 | 
			
		||||
            errmsg = '%s: Failed to parse JSON ' % video_id
 | 
			
		||||
            errmsg = f'{video_id}: Failed to parse JSON'
 | 
			
		||||
            if fatal:
 | 
			
		||||
                raise ExtractorError(errmsg, cause=ve)
 | 
			
		||||
            else:
 | 
			
		||||
 
 | 
			
		||||
@@ -15,7 +15,7 @@ import time
 | 
			
		||||
import traceback
 | 
			
		||||
 | 
			
		||||
from .common import InfoExtractor, SearchInfoExtractor
 | 
			
		||||
from ..compat import functools
 | 
			
		||||
from ..compat import functools  # isort: split
 | 
			
		||||
from ..compat import (
 | 
			
		||||
    compat_chr,
 | 
			
		||||
    compat_HTTPError,
 | 
			
		||||
@@ -483,6 +483,11 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
 | 
			
		||||
        if data:
 | 
			
		||||
            return self._parse_json(data, item_id, fatal=fatal)
 | 
			
		||||
 | 
			
		||||
    def _extract_yt_initial_variable(self, webpage, regex, video_id, name):
 | 
			
		||||
        return self._parse_json(self._search_regex(
 | 
			
		||||
            (fr'{regex}\s*{self._YT_INITIAL_BOUNDARY_RE}',
 | 
			
		||||
             regex), webpage, name, default='{}'), video_id, fatal=False, lenient=True)
 | 
			
		||||
 | 
			
		||||
    @staticmethod
 | 
			
		||||
    def _extract_session_index(*data):
 | 
			
		||||
        """
 | 
			
		||||
@@ -2733,54 +2738,38 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
 | 
			
		||||
        chapter_time = lambda chapter: parse_duration(self._get_text(chapter, 'timeDescription'))
 | 
			
		||||
        chapter_title = lambda chapter: self._get_text(chapter, 'title')
 | 
			
		||||
 | 
			
		||||
        return next((
 | 
			
		||||
            filter(None, (
 | 
			
		||||
                self._extract_chapters(
 | 
			
		||||
                    traverse_obj(contents, (..., 'macroMarkersListItemRenderer')),
 | 
			
		||||
                    chapter_time, chapter_title, duration)
 | 
			
		||||
                for contents in content_list
 | 
			
		||||
            ))), [])
 | 
			
		||||
        return next(filter(None, (
 | 
			
		||||
            self._extract_chapters(traverse_obj(contents, (..., 'macroMarkersListItemRenderer')),
 | 
			
		||||
                                   chapter_time, chapter_title, duration)
 | 
			
		||||
            for contents in content_list)), [])
 | 
			
		||||
 | 
			
		||||
    @staticmethod
 | 
			
		||||
    def _extract_chapters_from_description(description, duration):
 | 
			
		||||
        chapters = [{'start_time': 0}]
 | 
			
		||||
        for timestamp, title in re.findall(
 | 
			
		||||
                r'(?m)^((?:\d+:)?\d{1,2}:\d{2})\b\W*\s(.+?)\s*$', description or ''):
 | 
			
		||||
            start = parse_duration(timestamp)
 | 
			
		||||
            if start and title and chapters[-1]['start_time'] < start < duration:
 | 
			
		||||
                chapters[-1]['end_time'] = start
 | 
			
		||||
                chapters.append({
 | 
			
		||||
                    'start_time': start,
 | 
			
		||||
                    'title': title,
 | 
			
		||||
                })
 | 
			
		||||
    def _extract_chapters_from_description(self, description, duration):
 | 
			
		||||
        return self._extract_chapters(
 | 
			
		||||
            re.findall(r'(?m)^((?:\d+:)?\d{1,2}:\d{2})\b\W*\s(.+?)\s*$', description or ''),
 | 
			
		||||
            chapter_time=lambda x: parse_duration(x[0]), chapter_title=lambda x: x[1],
 | 
			
		||||
            duration=duration, strict=False)
 | 
			
		||||
 | 
			
		||||
    def _extract_chapters(self, chapter_list, chapter_time, chapter_title, duration, strict=True):
 | 
			
		||||
        if not duration:
 | 
			
		||||
            return
 | 
			
		||||
        chapter_list = [{
 | 
			
		||||
            'start_time': chapter_time(chapter),
 | 
			
		||||
            'title': chapter_title(chapter),
 | 
			
		||||
        } for chapter in chapter_list or []]
 | 
			
		||||
        if not strict:
 | 
			
		||||
            chapter_list.sort(key=lambda c: c['start_time'] or 0)
 | 
			
		||||
 | 
			
		||||
        chapters = [{'start_time': 0, 'title': '<Untitled>'}]
 | 
			
		||||
        for idx, chapter in enumerate(chapter_list):
 | 
			
		||||
            if chapter['start_time'] is None or not chapter['title']:
 | 
			
		||||
                self.report_warning(f'Incomplete chapter {idx}')
 | 
			
		||||
            elif chapters[-1]['start_time'] <= chapter['start_time'] <= duration:
 | 
			
		||||
                chapters[-1]['end_time'] = chapter['start_time']
 | 
			
		||||
                chapters.append(chapter)
 | 
			
		||||
            else:
 | 
			
		||||
                self.report_warning(f'Invalid start time for chapter "{chapter["title"]}"')
 | 
			
		||||
        chapters[-1]['end_time'] = duration
 | 
			
		||||
        return chapters[1:]
 | 
			
		||||
 | 
			
		||||
    def _extract_chapters(self, chapter_list, chapter_time, chapter_title, duration):
 | 
			
		||||
        chapters = []
 | 
			
		||||
        last_chapter = {'start_time': 0}
 | 
			
		||||
        for idx, chapter in enumerate(chapter_list or []):
 | 
			
		||||
            title = chapter_title(chapter)
 | 
			
		||||
            start_time = chapter_time(chapter)
 | 
			
		||||
            if start_time is None:
 | 
			
		||||
                continue
 | 
			
		||||
            last_chapter['end_time'] = start_time
 | 
			
		||||
            if start_time < last_chapter['start_time']:
 | 
			
		||||
                if idx == 1:
 | 
			
		||||
                    chapters.pop()
 | 
			
		||||
                    self.report_warning('Invalid start time for chapter "%s"' % last_chapter['title'])
 | 
			
		||||
                else:
 | 
			
		||||
                    self.report_warning(f'Invalid start time for chapter "{title}"')
 | 
			
		||||
                    continue
 | 
			
		||||
            last_chapter = {'start_time': start_time, 'title': title}
 | 
			
		||||
            chapters.append(last_chapter)
 | 
			
		||||
        last_chapter['end_time'] = duration
 | 
			
		||||
        return chapters
 | 
			
		||||
 | 
			
		||||
    def _extract_yt_initial_variable(self, webpage, regex, video_id, name):
 | 
			
		||||
        return self._parse_json(self._search_regex(
 | 
			
		||||
            (fr'{regex}\s*{self._YT_INITIAL_BOUNDARY_RE}',
 | 
			
		||||
             regex), webpage, name, default='{}'), video_id, fatal=False, lenient=True)
 | 
			
		||||
        return chapters if len(chapters) > 1 and chapters[1]['start_time'] else chapters[1:]
 | 
			
		||||
 | 
			
		||||
    def _extract_comment(self, comment_renderer, parent=None):
 | 
			
		||||
        comment_id = comment_renderer.get('commentId')
 | 
			
		||||
@@ -3663,7 +3652,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
 | 
			
		||||
 | 
			
		||||
        # Youtube Music Auto-generated description
 | 
			
		||||
        if video_description:
 | 
			
		||||
            mobj = re.search(r'(?s)(?P<track>[^·\n]+)·(?P<artist>[^\n]+)\n+(?P<album>[^\n]+)(?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?(?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?(.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))?.+\nAuto-generated by YouTube\.\s*$', video_description)
 | 
			
		||||
            mobj = re.search(
 | 
			
		||||
                r'''(?xs)
 | 
			
		||||
                    (?P<track>[^·\n]+)·(?P<artist>[^\n]+)\n+
 | 
			
		||||
                    (?P<album>[^\n]+)
 | 
			
		||||
                    (?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?
 | 
			
		||||
                    (?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?
 | 
			
		||||
                    (.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))?
 | 
			
		||||
                    .+\nAuto-generated\ by\ YouTube\.\s*$
 | 
			
		||||
                ''', video_description)
 | 
			
		||||
            if mobj:
 | 
			
		||||
                release_year = mobj.group('release_year')
 | 
			
		||||
                release_date = mobj.group('release_date')
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user