import os import re import types import urllib.parse import xml.etree.ElementTree from .common import InfoExtractor from .commonprotocols import RtmpIE from .youtube import YoutubeIE from ..compat import compat_etree_fromstring from ..cookies import LenientSimpleCookie from ..networking.exceptions import HTTPError from ..networking.impersonate import ImpersonateTarget from ..utils import ( KNOWN_EXTENSIONS, MEDIA_EXTENSIONS, ExtractorError, UnsupportedError, determine_ext, determine_protocol, dict_get, extract_basic_auth, filter_dict, format_field, int_or_none, is_html, js_to_json, merge_dicts, mimetype2ext, orderedSet, parse_duration, parse_resolution, smuggle_url, str_or_none, traverse_obj, try_call, unescapeHTML, unified_timestamp, unsmuggle_url, update_url, update_url_query, url_or_none, urlhandle_detect_ext, urljoin, variadic, xpath_attr, xpath_text, xpath_with_ns, ) from ..utils._utils import _UnsafeExtensionError class GenericIE(InfoExtractor): IE_DESC = 'Generic downloader that works on some sites' _VALID_URL = r'.*' IE_NAME = 'generic' _NETRC_MACHINE = False # Suppress username warning _TESTS = [{ # Direct link # https://github.com/ytdl-org/youtube-dl/commit/c5fa81fe81ce05cd81c20ff4ea6dac3dccdcbf9d 'url': 'https://media.w3.org/2010/05/sintel/trailer.mp4', 'md5': '67d406c2bcb6af27fa886f31aa934bbe', 'info_dict': { 'id': 'trailer', 'ext': 'mp4', 'title': 'trailer', 'direct': True, 'timestamp': 1273772943, 'upload_date': '20100513', }, }, { # Direct link: No HEAD support # https://github.com/ytdl-org/youtube-dl/issues/4032 'url': 'http://ai-radio.org:8000/radio.opus', 'info_dict': { 'id': 'radio', 'ext': 'opus', 'title': 'radio', }, 'skip': 'Invalid URL', }, { # Direct link: Incorrect MIME type # https://github.com/ytdl-org/youtube-dl/commit/c5fa81fe81ce05cd81c20ff4ea6dac3dccdcbf9d 'url': 'https://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm', 'md5': '4ccbebe5f36706d85221f204d7eb5913', 'info_dict': { 'id': 
'5_Lennart_Poettering_-_Systemd', 'ext': 'webm', 'title': '5_Lennart_Poettering_-_Systemd', 'direct': True, 'timestamp': 1416498816, 'upload_date': '20141120', }, }, { # Direct link: Live HLS; https://castr.com/hlsplayer/ # https://github.com/yt-dlp/yt-dlp/pull/6775 'url': 'https://stream-akamai.castr.com/5b9352dbda7b8c769937e459/live_2361c920455111ea85db6911fe397b9e/index.fmp4.m3u8', 'info_dict': { 'id': 'index.fmp4', 'ext': 'mp4', 'title': str, 'live_status': 'is_live', }, 'params': {'skip_download': 'm3u8'}, }, { # Compressed when `Accept-Encoding: *` # https://github.com/ytdl-org/youtube-dl/commit/a074e922967fa571d4f1abb1773c711747060f00 'url': 'http://calimero.tk/muzik/FictionJunction-Parallel_Hearts.flac', 'info_dict': { 'id': 'FictionJunction-Parallel_Hearts', 'ext': 'flac', 'title': 'FictionJunction-Parallel_Hearts', }, 'skip': 'Invalid URL', }, { # `Content-Encoding: br` when `Accept-Encoding: *` # https://github.com/yt-dlp/yt-dlp/commit/3e01ce744a981d8f19ae77ec695005e7000f4703 'url': 'https://www.extra.cz/cauky-lidi-70-dil-babis-predstavil-pohadky-prymulanek-nebo-andrejovy-nove-saty-ac867', 'md5': 'a9a2cad3e54f78e4680c6deef82417e9', 'info_dict': { 'id': 'cauky-lidi-70-dil-babis-predstavil-pohadky-prymulanek-nebo-andrejovy-nove-saty-ac867', 'ext': 'mp4', 'title': 'čauky lidi 70 finall', 'age_limit': 0, 'description': 'md5:47b2673a5b76780d9d329783e1fbf5aa', 'direct': True, 'duration': 318.0, 'thumbnail': r're:https?://media\.extra\.cz/static/img/.+\.jpg', 'timestamp': 1654513791, 'upload_date': '20220606', }, 'params': {'extractor_args': {'generic': {'impersonate': ['chrome']}}}, }, { # HLS: `Content-Type: audio/mpegurl`; https://bitmovin.com/demos/stream-test # https://github.com/ytdl-org/youtube-dl/commit/20938f768b16c945c6041ba3c0a7ae1a4e790881 'url': 'https://cdn.bitmovin.com/content/assets/art-of-motion-dash-hls-progressive/m3u8s/f08e80da-bf1d-4e3d-8899-f0f6155f6efa.m3u8', 'info_dict': { 'id': 'f08e80da-bf1d-4e3d-8899-f0f6155f6efa', 'ext': 'mp4', 
'title': 'f08e80da-bf1d-4e3d-8899-f0f6155f6efa', 'duration': 211, 'timestamp': 1737363648, 'upload_date': '20250120', }, 'params': {'skip_download': 'm3u8'}, }, { # HLS: `Content-Type: text/plain`; https://github.com/grafov/m3u8 # https://github.com/ytdl-org/youtube-dl/commit/edd9b71c2cca7e5a0df8799710d9ad410ec77d29 'url': 'https://raw.githubusercontent.com/grafov/m3u8/refs/heads/master/sample-playlists/master.m3u8', 'info_dict': { 'id': 'master', 'ext': 'mp4', 'title': 'master', }, 'params': {'skip_download': 'm3u8'}, }, { # MPEG-DASH; https://bitmovin.com/demos/stream-test # https://github.com/ytdl-org/youtube-dl/commit/9d939cec48f06a401fb79eb078c1fc50b2aefbe1 'url': 'https://cdn.bitmovin.com/content/assets/art-of-motion-dash-hls-progressive/mpds/f08e80da-bf1d-4e3d-8899-f0f6155f6efa.mpd', 'info_dict': { 'id': 'f08e80da-bf1d-4e3d-8899-f0f6155f6efa', 'ext': 'mp4', 'title': 'f08e80da-bf1d-4e3d-8899-f0f6155f6efa', 'timestamp': 1737363728, 'upload_date': '20250120', }, 'params': {'skip_download': True}, }, { # Live MPEG-DASH; https://livesim2.dashif.org/urlgen/create # https://github.com/yt-dlp/yt-dlp/pull/12256 'url': 'https://livesim2.dashif.org/livesim2/ato_10/testpic_2s/Manifest.mpd', 'info_dict': { 'id': 'Manifest', 'ext': 'mp4', 'title': str, 'live_status': 'is_live', }, 'params': {'skip_download': 'livestream'}, }, { # SMIL # https://github.com/ytdl-org/youtube-dl/pull/6428 'url': 'https://api.new.livestream.com/accounts/21/events/7954027/videos/166558123.secure.smil', 'info_dict': { 'id': '166558123.secure', 'ext': 'mp4', 'title': '73fb2379-a624-4b6c-bce4-e46086007f2c', }, 'params': {'skip_download': 'smil'}, }, { # XSPF playlist; https://shellac-archive.ch/de/index.html # https://github.com/ytdl-org/youtube-dl/commit/1de5cd3ba51ce67d9a1cd3b40157058e78e46692 'url': 'https://shellac-archive.ch/repository/xspf/22-AL0019Z.xspf', 'info_dict': { 'id': '22-AL0019Z', }, 'playlist_count': 12, 'params': {'skip_download': True}, }, { # RSS feed # 
https://github.com/ytdl-org/youtube-dl/commit/c5fa81fe81ce05cd81c20ff4ea6dac3dccdcbf9d 'url': 'http://phihag.de/2014/youtube-dl/rss2.xml', 'info_dict': { 'id': 'https://phihag.de/2014/youtube-dl/rss2.xml', 'title': 'Zero Punctuation', 'description': 'md5:512ae5f840e52eb3c0d08d4bed08eb3e', }, 'playlist_mincount': 11, }, { # RSS feed: Includes enclosure, description, and thumbnails # https://github.com/ytdl-org/youtube-dl/pull/27405 'url': 'https://anchor.fm/s/dd00e14/podcast/rss', 'info_dict': { 'id': 'https://anchor.fm/s/dd00e14/podcast/rss', 'title': '100% Hydrogen ', 'description': 'md5:7ec96327f8b91a2549a2e74f064022a1', }, 'playlist_count': 1, 'params': {'skip_download': True}, }, { # RSS feed: Includes guid 'url': 'https://www.omnycontent.com/d/playlist/a7b4f8fe-59d9-4afc-a79a-a90101378abf/bf2c1d80-3656-4449-9d00-a903004e8f84/efbff746-e7c1-463a-9d80-a903004e8f8f/podcast.rss', 'info_dict': { 'id': 'https://www.omnycontent.com/d/playlist/a7b4f8fe-59d9-4afc-a79a-a90101378abf/bf2c1d80-3656-4449-9d00-a903004e8f84/efbff746-e7c1-463a-9d80-a903004e8f8f/podcast.rss', 'title': 'The Little Red Podcast', 'description': 'md5:be809a44b63b0c56fb485caf68685520', }, 'playlist_mincount': 76, }, { # RSS feed: Includes enclosure and unsupported URLs # https://github.com/ytdl-org/youtube-dl/pull/16189 'url': 'https://www.interfax.ru/rss.asp', 'info_dict': { 'id': 'https://www.interfax.ru/rss.asp', 'title': 'Интерфакс', 'description': 'md5:49b6b8905772efba21923942bbc0444c', }, 'playlist_mincount': 25, }, { # Webpage starts with a duplicate UTF-8 BOM # https://github.com/yt-dlp/yt-dlp/commit/80e8493ee7c3083f4e215794e4a67ba5265f24f7 'url': 'https://www.filmarkivet.se/movies/paris-d-moll/', 'md5': 'df02cadc719dcc63d43288366f037754', 'info_dict': { 'id': 'paris-d-moll', 'ext': 'mp4', 'title': 'Paris d-moll', 'age_limit': 0, 'description': 'md5:319e37ea5542293db37e1e13072fe330', 'thumbnail': r're:https?://www\.filmarkivet\.se/wp-content/uploads/.+\.jpg', }, }, { # Multiple HTML5 videos # 
https://github.com/ytdl-org/youtube-dl/pull/14107 'url': 'https://www.dagbladet.no/nyheter/etter-ett-ars-planlegging-klaffet-endelig-alt---jeg-matte-ta-en-liten-dans/60413035', 'info_dict': { 'id': '60413035', 'title': 'Etter ett års planlegging, klaffet endelig alt: - Jeg måtte ta en liten dans', 'age_limit': 0, 'description': 'md5:bbb4e12e42e78609a74fd421b93b1239', 'thumbnail': r're:https?://www\.dagbladet\.no/images/.+', }, 'playlist_count': 2, }, { # Cinerama Player # https://github.com/ytdl-org/youtube-dl/commit/501f13fbf3d1f7225f91e3e0ad008df2cd3219f1 'url': 'https://www.abc.net.au/res/libraries/cinerama2/examples/single_clip.htm', 'info_dict': { 'id': 'single_clip', 'title': 'Single Clip player examples', 'age_limit': 0, }, 'playlist_count': 3, }, { # FIXME: Improve extraction # Flowplayer # https://github.com/ytdl-org/youtube-dl/commit/4d805e063c6c4ffd557d7c7cb905a3ed9c926b08 'url': 'https://flowplayer.com/resources/demos/standard-setup', 'info_dict': { 'id': 'playlist', 'ext': 'mp4', 'title': 'playlist', 'duration': 13, 'timestamp': 1539082175, 'upload_date': '20181009', }, 'params': {'skip_download': 'm3u8'}, }, { # JW Player: YouTube # https://github.com/ytdl-org/youtube-dl/commit/a0f719854463c6f4226e4042dfa80c1b17154e1d 'url': 'https://media.nationalarchives.gov.uk/index.php/webinar-using-discovery-national-archives-online-catalogue/', 'info_dict': { 'id': 'Mrj4DVp2zeA', 'ext': 'mp4', 'title': 'Using Discovery, The National Archives’ online catalogue', 'age_limit': 0, 'availability': 'unlisted', 'categories': ['Education'], 'channel': 'The National Archives UK', 'channel_follower_count': int, 'channel_id': 'UCUuzebc1yADDJEnOLA5P9xw', 'channel_url': 'https://www.youtube.com/channel/UCUuzebc1yADDJEnOLA5P9xw', 'chapters': 'count:13', 'description': 'md5:a236581cd2449dd2df4f93412f3f01c6', 'duration': 3066, 'like_count': int, 'live_status': 'not_live', 'media_type': 'video', 'playable_in_embed': True, 'tags': 'count:5', 'thumbnail': 
r're:https?://i\.ytimg\.com/vi/.+', 'timestamp': 1423757117, 'upload_date': '20150212', 'uploader': 'The National Archives UK', 'uploader_id': '@TheNationalArchivesUK', 'uploader_url': 'https://www.youtube.com/@TheNationalArchivesUK', 'view_count': int, }, 'add_ie': ['Youtube'], }, { # JW Player: Complex # https://github.com/ytdl-org/youtube-dl/commit/a4a554a79354981fcab55de8eaab7b95a40bbb48 'url': 'https://www.indiedb.com/games/king-machine/videos', 'info_dict': { 'id': 'videos-1', 'ext': 'mp4', 'title': 'Videos & Audio - King Machine (1)', 'age_limit': 0, 'description': 'Browse King Machine videos & audio for sweet media. Your eyes will thank you.', 'thumbnail': r're:https?://media\.indiedb\.com/cache/images/.+\.jpg', '_old_archive_ids': ['generic videos'], }, }, { # JW Player: JSON Feed URL # https://github.com/yt-dlp/yt-dlp/issues/1476 'url': 'https://foodschmooze.org/', 'info_dict': { 'id': 'z00Frhnw', 'ext': 'mp4', 'title': 'Grilling Beef Tenderloin', 'description': '', 'duration': 392.0, 'thumbnail': r're:https?://cdn\.jwplayer\.com/v2/media/.+', 'timestamp': 1465313685, 'upload_date': '20160607', }, 'params': {'skip_download': 'm3u8'}, }, { # JW Player: RTMP # https://github.com/ytdl-org/youtube-dl/issues/11993 'url': 'http://www.suffolk.edu/sjc/live.php', 'info_dict': { 'id': 'live', 'ext': 'flv', 'title': 'Massachusetts Supreme Judicial Court Oral Arguments', }, 'skip': 'Invalid URL', }, { # KVS Player v7.3.3 # kt_player.js?v=5.1.1 'url': 'https://bogmedia.org/videos/21217/40-nochey-2016/', 'md5': '94166bdb26b4cb1fb9214319a629fc51', 'info_dict': { 'id': '21217', 'ext': 'mp4', 'title': '40 ночей (2016) - BogMedia.org', 'age_limit': 0, 'description': 'md5:4e6d7d622636eb7948275432eb256dc3', 'display_id': '40-nochey-2016', 'thumbnail': r're:https?://bogmedia\.org/contents/videos_screenshots/.+\.jpg', }, }, { # KVS Player v7.7.11 # kt_player.js?v=5.5.1 # https://github.com/yt-dlp/yt-dlp/commit/a318f59d14792d25b2206c3f50181e03e8716db7 'url': 
'https://youix.com/video/leningrad-zoj/', 'md5': '94f96ba95706dc3880812b27b7d8a2b8', 'info_dict': { 'id': '18485', 'ext': 'mp4', 'title': 'Клип: Ленинград - ЗОЖ скачать, смотреть онлайн | Youix.com', 'age_limit': 0, 'display_id': 'leningrad-zoj', 'thumbnail': r're:https?://youix\.com/contents/videos_screenshots/.+\.jpg', }, }, { # KVS Player v7.10.3 # kt_player.js?v=12 # https://github.com/ytdl-org/youtube-dl/commit/fc2beab0e701c497a003f11fef5c0df54fba1da3 'url': 'https://shooshtime.com/videos/346037/fresh-out-of-the-shower/', 'md5': 'c9a97ad528607a4516d4df83a3aeb12c', 'info_dict': { 'id': '346037', 'ext': 'mp4', 'title': 'Fresh out of the shower - Shooshtime', 'age_limit': 18, 'description': 'md5:efd70fd3973f8750d285c743b910580a', 'display_id': 'fresh-out-of-the-shower', 'thumbnail': r're:https?://i\.shoosh\.co/contents/videos_screenshots/.+\.jpg', }, 'expected_warnings': ['Untested major version'], }, { # FIXME: Unable to extract flashvars # KVS Player v7.11.4 # kt_player.js?v=2.11.5.1 # https://github.com/yt-dlp/yt-dlp/commit/a318f59d14792d25b2206c3f50181e03e8716db7 'url': 'https://www.kvs-demo.com/video/105/kelis-4th-of-july/', 'info_dict': { 'id': '105', 'ext': 'mp4', 'title': 'Kelis - 4th Of July', }, }, { # KVS Player v7.11.4 # kt_player.js?v=6.3.2 # https://github.com/yt-dlp/yt-dlp/commit/a318f59d14792d25b2206c3f50181e03e8716db7 'url': 'https://www.kvs-demo.com/embed/105/', 'md5': '1ff84c70acaddbb03288c6cc5ee1879f', 'info_dict': { 'id': '105', 'ext': 'mp4', 'title': 'Kelis - 4th Of July / Embed Player', 'age_limit': 0, 'display_id': 'kelis-4th-of-july', 'thumbnail': r're:https?://www\.kvs-demo\.com/contents/videos_screenshots/.+\.jpg', }, }, { # twitter:player:stream # https://github.com/ytdl-org/youtube-dl/commit/371ddb14fe651d4a1e5a8310d6d7c0e395cd92b0 'url': 'https://beltzlaw.com/', 'info_dict': { 'id': 'beltzlaw-1', 'ext': 'mp4', 'title': 'Beltz Law Group | Dallas Traffic Ticket, Accident & Criminal Attorney (1)', 'age_limit': 0, 'description': 
'md5:5bdf23fcb76801dc3b31e74cabf82147', 'thumbnail': r're:https?://beltzlaw\.com/wp-content/uploads/.+\.jpg', 'timestamp': int, # varies 'upload_date': str, '_old_archive_ids': ['generic beltzlaw'], }, }, { # twitter:player # https://github.com/ytdl-org/youtube-dl/commit/329179073b93e37ab76e759d1fe96d8f984367f3 'url': 'https://cine.ar/', 'md5': 'd3e33335e339f04008690118698dfd08', 'info_dict': { 'id': 'cine-1', 'ext': 'webm', 'title': 'CINE.AR (1)', 'age_limit': 0, 'description': 'md5:a4e58f9e2291c940e485f34251898c4a', 'thumbnail': r're:https?://cine\.ar/img/.+\.png', '_old_archive_ids': ['generic cine'], }, 'params': {'format': 'webm'}, }, { # JSON-LD: multiple @type # https://github.com/yt-dlp/yt-dlp/commit/f3c0c77304bc0e5614a65c45629de22f067685ac 'url': 'https://www.nu.nl/280161/video/hoe-een-bladvlo-dit-verwoestende-japanse-onkruid-moet-vernietigen.html', 'info_dict': { 'id': 'ipy2AcGL', 'ext': 'mp4', 'title': 'Hoe een bladvlo dit verwoestende Japanse onkruid moet vernietigen', 'age_limit': 0, 'description': 'md5:6a9d644bab0dc2dc06849c2505d8383d', 'duration': 111.0, 'thumbnail': r're:https?://images\.nu\.nl/.+\.jpg', 'timestamp': 1586584674, 'upload_date': '20200411', }, 'params': {'extractor_args': {'generic': {'impersonate': ['chrome']}}}, }, { # JSON-LD: unexpected @type # https://github.com/yt-dlp/yt-dlp/pull/5145 'url': 'https://www.autoweek.nl/autotests/artikel/porsche-911-gt3-rs-rij-impressie-2/', 'info_dict': { 'id': 'porsche-911-gt3-rs-rij-impressie-2', 'ext': 'mp4', 'title': 'Test: Porsche 911 GT3 RS - AutoWeek', 'age_limit': 0, 'description': 'md5:a17b5bd84288448d8f11b838505718fc', 'direct': True, 'thumbnail': r're:https?://images\.autoweek\.nl/.+', 'timestamp': 1664920902, 'upload_date': '20221004', }, 'params': {'extractor_args': {'generic': {'impersonate': ['chrome']}}}, }, { # JSON-LD: VideoObject # https://github.com/ytdl-org/youtube-dl/commit/6e6b70d65f0681317c425bfe1e157f3474afbbe8 'url': 'https://breezy.hr/', 'info_dict': { 'id': 'k6gl2kt2eq', 
'ext': 'mp4', 'title': 'Breezy HR\'s ATS helps you find & hire employees sooner', 'age_limit': 0, 'average_rating': 4.5, 'description': 'md5:eee75fdd3044c538003f3be327ba01e1', 'duration': 60.1, 'thumbnail': r're:https?://cdn\.prod\.website-files\.com/.+\.webp', 'timestamp': 1485734400, 'upload_date': '20170130', }, }, { # Video.js: VOD HLS # https://github.com/yt-dlp/yt-dlp/pull/6775 'url': 'https://gist.githubusercontent.com/bashonly/2aae0862c50f4a4b84f220c315767208/raw/e3380d413749dabbe804c9c2d8fd9a45142475c7/videojs_hls_test.html', 'info_dict': { 'id': 'videojs_hls_test', 'ext': 'mp4', 'title': 'video', 'age_limit': 0, 'duration': 1800, }, 'params': {'skip_download': 'm3u8'}, }, { # Video.js: YouTube # https://github.com/ytdl-org/youtube-dl/commit/63d990d2859d0e981da2e416097655798334431b 'url': 'https://ortcam.com/solidworks-%d1%83%d1%80%d0%be%d0%ba-6-%d0%bd%d0%b0%d1%81%d1%82%d1%80%d0%be%d0%b9%d0%ba%d0%b0-%d1%87%d0%b5%d1%80%d1%82%d0%b5%d0%b6%d0%b0_33f9b7351.html?vid=33f9b7351', 'info_dict': { 'id': 'yygqldloqIk', 'ext': 'mp4', 'title': 'SolidWorks. 
Урок 6 Настройка чертежа',
            'age_limit': 0,
            'availability': 'public',
            'categories': ['Education'],
            'channel': 'PROстое3D',
            'channel_follower_count': int,
            'channel_id': 'UCy91Bug3dERhbwGh2m2Ijng',
            'channel_url': 'https://www.youtube.com/channel/UCy91Bug3dERhbwGh2m2Ijng',
            'comment_count': int,
            'description': 'md5:baf95267792646afdbf030e4d06b2ab3',
            'duration': 1160,
            'heatmap': 'count:100',
            'like_count': int,
            'live_status': 'not_live',
            'media_type': 'video',
            'playable_in_embed': True,
            'tags': 'count:17',
            'thumbnail': r're:https?://i\.ytimg\.com/vi/.+',
            'timestamp': 1363263144,
            'upload_date': '20130314',
            'uploader': 'PROстое3D',
            'uploader_id': '@PROstoe3D',
            'uploader_url': 'https://www.youtube.com/@PROstoe3D',
            'view_count': int,
        },
        'add_ie': ['Youtube'],
    }, {
        # Redirect
        # https://github.com/ytdl-org/youtube-dl/issues/413
        'url': 'https://www.google.com/url?rct=j&url=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY',
        'info_dict': {
            'id': 'cmQHVoWB5FY',
            'ext': 'mp4',
            'title': 'First Firefox OS phones side-by-side',
            'age_limit': 0,
            'availability': 'public',
            'categories': ['Entertainment'],
            'channel': 'The Verge',
            'channel_follower_count': int,
            'channel_id': 'UCddiUEpeqJcYeBxX1IVBKvQ',
            'channel_is_verified': True,
            'channel_url': 'https://www.youtube.com/channel/UCddiUEpeqJcYeBxX1IVBKvQ',
            'comment_count': int,
            'description': 'md5:7a676046ad24d9ea55cdde4a6657c5b3',
            'duration': 207,
            'like_count': int,
            'live_status': 'not_live',
            'media_type': 'video',
            'playable_in_embed': True,
            'tags': 'count:15',
            'thumbnail': r're:https?://i\.ytimg\.com/vi/.+',
            'timestamp': 1361738430,
            'upload_date': '20130224',
            'uploader': 'The Verge',
            'uploader_id': '@TheVerge',
            'uploader_url': 'https://www.youtube.com/@TheVerge',
            'view_count': int,
        },
        'add_ie': ['Youtube'],
    }]

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(f'[redirect] Following redirect to {new_url}')

    def report_detected(self, name, num=1, note=None):
        """Write a debug line reporting that *num* embeds/players of kind *name* were found.

        A count of 0 logs nothing; a count of 1 is rendered as "a <name>";
        *note* is appended after "; " when given.
        """
        if num > 1:
            name += 's'
        elif not num:
            # Nothing was detected; stay silent
            return
        else:
            num = 'a'
        self._downloader.write_debug(f'Identified {num} {name}{format_field(note, None, "; %s")}')

    def _extra_manifest_info(self, info, manifest_url):
        """Apply manifest-related extractor-args to *info* (mutated in place).

        Handles the generic extractor's ``fragment_query``, ``key_query``,
        ``hls_key`` and ``variant_query`` extractor-args, then probes the first
        HLS format (if any) to decide between live and VOD playback, honouring
        an explicit ``is_live`` extractor-arg when present.
        """
        fragment_query = self._configuration_arg('fragment_query', [None], casesense=True)[0]
        if fragment_query is not None:
            # Accept either a full URL (whose query string is reused) or a raw
            # query string; otherwise fall back to the manifest URL's own query
            info['extra_param_to_segment_url'] = (
                urllib.parse.urlparse(fragment_query).query or fragment_query
                or urllib.parse.urlparse(manifest_url).query or None)

        key_query = self._configuration_arg('key_query', [None], casesense=True)[0]
        if key_query is not None:
            info['extra_param_to_key_url'] = (
                urllib.parse.urlparse(key_query).query or key_query
                or urllib.parse.urlparse(manifest_url).query or None)

        def hex_or_none(value):
            # Only accept plain hex strings (optionally 0x-prefixed)
            return value if re.fullmatch(r'(0x)?[\da-f]+', value, re.IGNORECASE) else None

        info['hls_aes'] = traverse_obj(self._configuration_arg('hls_key', casesense=True), {
            'uri': (0, {url_or_none}),
            'key': (0, {hex_or_none}),
            'iv': (1, {hex_or_none}),
        }) or None

        variant_query = self._configuration_arg('variant_query', [None], casesense=True)[0]
        if variant_query is not None:
            query = urllib.parse.parse_qs(
                urllib.parse.urlparse(variant_query).query or variant_query
                or urllib.parse.urlparse(manifest_url).query)
            for fmt in self._downloader._get_formats(info):
                fmt['url'] = update_url_query(fmt['url'], query)

        # Attempt to detect live HLS or set VOD duration
        m3u8_format = next((f for f in self._downloader._get_formats(info)
                            if determine_protocol(f) == 'm3u8_native'), None)
        if m3u8_format:
            is_live = self._configuration_arg('is_live', [None])[0]
            if is_live is not None:
                # Explicit override via extractor-arg; skip the network probe
                info['live_status'] = 'not_live' if is_live == 'false' else 'is_live'
                return
            headers = m3u8_format.get('http_headers') or info.get('http_headers') or {}
            display_id = info.get('id')
            urlh = self._request_webpage(
                m3u8_format['url'], display_id, 'Checking m3u8 live status', errnote=False,
                headers={**headers, 'Accept-Encoding': 'identity'}, fatal=False)
            if urlh is False:
                return
            first_bytes = urlh.read(512)
            if not first_bytes.startswith(b'#EXTM3U'):
                return
            m3u8_doc = self._webpage_read_content(
                urlh, urlh.url, display_id, prefix=first_bytes, fatal=False, errnote=False)
            if not m3u8_doc:
                return
            duration = self._parse_m3u8_vod_duration(m3u8_doc, display_id)
            if not duration:
                # No VOD duration tags in the playlist => treat as live
                info['live_status'] = 'is_live'
            info['duration'] = info.get('duration') or duration
not first_bytes.startswith(b'#EXTM3U'): return m3u8_doc = self._webpage_read_content( urlh, urlh.url, display_id, prefix=first_bytes, fatal=False, errnote=False) if not m3u8_doc: return duration = self._parse_m3u8_vod_duration(m3u8_doc, display_id) if not duration: info['live_status'] = 'is_live' info['duration'] = info.get('duration') or duration def _extract_rss(self, url, video_id, doc): NS_MAP = { 'itunes': 'http://www.itunes.com/dtds/podcast-1.0.dtd', } entries = [] for it in doc.findall('./channel/item'): next_url = next( (e.attrib.get('url') for e in it.findall('./enclosure')), xpath_text(it, 'link', fatal=False)) if not next_url: continue guid = try_call(lambda: it.find('guid').text) if guid: next_url = smuggle_url(next_url, {'force_videoid': guid}) def itunes(key): return xpath_text(it, xpath_with_ns(f'./itunes:{key}', NS_MAP), default=None) entries.append({ '_type': 'url_transparent', 'url': next_url, 'title': try_call(lambda: it.find('title').text), 'description': xpath_text(it, 'description', default=None), 'timestamp': unified_timestamp(xpath_text(it, 'pubDate', default=None)), 'duration': parse_duration(itunes('duration')), 'thumbnail': url_or_none(xpath_attr(it, xpath_with_ns('./itunes:image', NS_MAP), 'href')), 'episode': itunes('title'), 'episode_number': int_or_none(itunes('episode')), 'season_number': int_or_none(itunes('season')), 'age_limit': {'true': 18, 'yes': 18, 'false': 0, 'no': 0}.get((itunes('explicit') or '').lower()), }) return { '_type': 'playlist', 'id': url, 'title': try_call(lambda: doc.find('./channel/title').text), 'description': try_call(lambda: doc.find('./channel/description').text), 'entries': entries, } @classmethod def _kvs_get_real_url(cls, video_url, license_code): if not video_url.startswith('function/0/'): return video_url # not obfuscated parsed = urllib.parse.urlparse(video_url[len('function/0/'):]) license_token = cls._kvs_get_license_token(license_code) urlparts = parsed.path.split('/') HASH_LENGTH = 32 hash_ = 
urlparts[3][:HASH_LENGTH] indices = list(range(HASH_LENGTH)) # Swap indices of hash according to the destination calculated from the license token accum = 0 for src in reversed(range(HASH_LENGTH)): accum += license_token[src] dest = (src + accum) % HASH_LENGTH indices[src], indices[dest] = indices[dest], indices[src] urlparts[3] = ''.join(hash_[index] for index in indices) + urlparts[3][HASH_LENGTH:] return urllib.parse.urlunparse(parsed._replace(path='/'.join(urlparts))) @staticmethod def _kvs_get_license_token(license_code): license_code = license_code.replace('$', '') license_values = [int(char) for char in license_code] modlicense = license_code.replace('0', '1') center = len(modlicense) // 2 fronthalf = int(modlicense[:center + 1]) backhalf = int(modlicense[center:]) modlicense = str(4 * abs(fronthalf - backhalf))[:center + 1] return [ (license_values[index + offset] + current) % 10 for index, current in enumerate(map(int, modlicense)) for offset in range(4) ] def _extract_kvs(self, url, webpage, video_id): flashvars = self._search_json( r'(?s:]*>.*?var\s+flashvars\s*=)', webpage, 'flashvars', video_id, transform_source=js_to_json) # extract the part after the last / as the display_id from the # canonical URL. 
display_id = self._search_regex( r'(?:' r'|)', webpage, 'display_id', fatal=False) title = self._html_search_regex(r'<(?:h1|title)>(?:Video: )?(.+?)', webpage, 'title') thumbnail = flashvars['preview_url'] if thumbnail.startswith('//'): protocol, _, _ = url.partition('/') thumbnail = protocol + thumbnail url_keys = list(filter(re.compile(r'^video_(?:url|alt_url\d*)$').match, flashvars.keys())) formats = [] for key in url_keys: if '/get_file/' not in flashvars[key]: continue format_id = flashvars.get(f'{key}_text', key) formats.append({ 'url': urljoin(url, self._kvs_get_real_url(flashvars[key], flashvars['license_code'])), 'format_id': format_id, 'ext': 'mp4', **(parse_resolution(format_id) or parse_resolution(flashvars[key])), 'http_headers': {'Referer': url}, }) if not formats[-1].get('height'): formats[-1]['quality'] = 1 return { 'id': flashvars['video_id'], 'display_id': display_id, 'title': title, 'thumbnail': urljoin(url, thumbnail), 'formats': formats, } def _real_extract(self, url): if url.startswith('//'): return self.url_result(self.http_scheme() + url) parsed_url = urllib.parse.urlparse(url) if not parsed_url.scheme: default_search = self.get_param('default_search') if default_search is None: default_search = 'fixup_error' if default_search in ('auto', 'auto_warning', 'fixup_error'): if re.match(r'[^\s/]+\.[^\s/]+/', url): self.report_warning('The url doesn\'t specify the protocol, trying with http') return self.url_result('http://' + url) elif default_search != 'fixup_error': if default_search == 'auto_warning': if re.match(r'^(?:url|URL)$', url): raise ExtractorError( f'Invalid URL: {url!r} . Call yt-dlp like this: yt-dlp -v "https://www.youtube.com/watch?v=BaW_jenozKc" ', expected=True) else: self.report_warning( f'Falling back to youtube search for {url} . Set --default-search "auto" to suppress this warning.') return self.url_result('ytsearch:' + url) if default_search in ('error', 'fixup_error'): raise ExtractorError( f'{url!r} is not a valid URL. 
' f'Set --default-search "ytsearch" (or run yt-dlp "ytsearch:{url}" ) to search YouTube', expected=True) else: if ':' not in default_search: default_search += ':' return self.url_result(default_search + url) original_url = url url, smuggled_data = unsmuggle_url(url, {}) force_videoid = None is_intentional = smuggled_data.get('to_generic') if 'force_videoid' in smuggled_data: force_videoid = smuggled_data['force_videoid'] video_id = force_videoid else: video_id = self._generic_id(url) # Do not impersonate by default; see https://github.com/yt-dlp/yt-dlp/issues/11335 impersonate = self._configuration_arg('impersonate', ['false']) if 'false' in impersonate: impersonate = None # Some webservers may serve compressed content of rather big size (e.g. gzipped flac) # making it impossible to download only chunk of the file (yet we need only 512kB to # test whether it's HTML or not). According to yt-dlp default Accept-Encoding # that will always result in downloading the whole file that is not desirable. # Therefore for extraction pass we have to override Accept-Encoding to any in order # to accept raw bytes and being able to download only a chunk. # It may probably better to solve this by checking Content-Type for application/octet-stream # after a HEAD request, but not sure if we can rely on this. 
try: full_response = self._request_webpage(url, video_id, headers=filter_dict({ 'Accept-Encoding': 'identity', 'Referer': smuggled_data.get('referer'), }), impersonate=impersonate) except ExtractorError as e: if not (isinstance(e.cause, HTTPError) and e.cause.status == 403 and e.cause.response.get_header('cf-mitigated') == 'challenge' and e.cause.response.extensions.get('impersonate') is None): raise cf_cookie_domain = traverse_obj( LenientSimpleCookie(e.cause.response.get_header('set-cookie')), ('__cf_bm', 'domain')) if cf_cookie_domain: self.write_debug(f'Clearing __cf_bm cookie for {cf_cookie_domain}') self.cookiejar.clear(domain=cf_cookie_domain, path='/', name='__cf_bm') msg = 'Got HTTP Error 403 caused by Cloudflare anti-bot challenge; ' if not self._downloader._impersonate_target_available(ImpersonateTarget()): msg += ('see https://github.com/yt-dlp/yt-dlp#impersonation for ' 'how to install the required impersonation dependency, and ') raise ExtractorError( f'{msg}try again with --extractor-args "generic:impersonate"', expected=True) new_url = full_response.url if new_url != extract_basic_auth(url)[0]: self.report_following_redirect(new_url) if force_videoid: new_url = smuggle_url(new_url, {'force_videoid': force_videoid}) return self.url_result(new_url) info_dict = { 'id': video_id, 'title': self._generic_title(url), 'timestamp': unified_timestamp(full_response.headers.get('Last-Modified')), } # Check for direct link to a video content_type = full_response.headers.get('Content-Type', '').lower() m = re.match(r'(?Paudio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P[^;\s]+)', content_type) if m: self.report_detected('direct video link') headers = filter_dict({'Referer': smuggled_data.get('referer')}) format_id = str(m.group('format_id')) ext = determine_ext(url, default_ext=None) or urlhandle_detect_ext(full_response) subtitles = {} if format_id.endswith('mpegurl') or ext == 'm3u8': formats, subtitles = 
self._extract_m3u8_formats_and_subtitles(url, video_id, 'mp4', headers=headers) elif format_id == 'f4m' or ext == 'f4m': formats = self._extract_f4m_formats(url, video_id, headers=headers) # Don't check for DASH/mpd here, do it later w/ first_bytes. Same number of requests either way else: formats = [{ 'format_id': format_id, 'url': url, 'ext': ext, 'vcodec': 'none' if m.group('type') == 'audio' else None, }] info_dict['direct'] = True info_dict.update({ 'formats': formats, 'subtitles': subtitles, 'http_headers': headers or None, }) self._extra_manifest_info(info_dict, url) return info_dict if not self.get_param('test', False) and not is_intentional: force = self.get_param('force_generic_extractor', False) self.report_warning('%s generic information extractor' % ('Forcing' if force else 'Falling back on')) first_bytes = full_response.read(512) # Is it an M3U playlist? if first_bytes.startswith(b'#EXTM3U'): self.report_detected('M3U playlist') info_dict['formats'], info_dict['subtitles'] = self._extract_m3u8_formats_and_subtitles(url, video_id, 'mp4') self._extra_manifest_info(info_dict, url) return info_dict # Maybe it's a direct link to a video? # Be careful not to download the whole thing! if not is_html(first_bytes): self.report_warning( 'URL could be a direct video link, returning it as such.') ext = determine_ext(url) if ext not in _UnsafeExtensionError.ALLOWED_EXTENSIONS: ext = 'unknown_video' info_dict.update({ 'direct': True, 'url': url, 'ext': ext, }) return info_dict webpage = self._webpage_read_content( full_response, url, video_id, prefix=first_bytes) if 'DPG Media Privacy Gate' in webpage: webpage = self._download_webpage(url, video_id) self.report_extraction(video_id) # Is it an RSS feed, a SMIL file, an XSPF playlist or a MPD manifest? 
try: try: doc = compat_etree_fromstring(webpage) except xml.etree.ElementTree.ParseError: doc = compat_etree_fromstring(webpage.encode()) if doc.tag == 'rss': self.report_detected('RSS feed') return self._extract_rss(url, video_id, doc) elif doc.tag == 'SmoothStreamingMedia': info_dict['formats'], info_dict['subtitles'] = self._parse_ism_formats_and_subtitles(doc, url) self.report_detected('ISM manifest') return info_dict elif re.match(r'^(?:{[^}]+})?smil$', doc.tag): smil = self._parse_smil(doc, url, video_id) self.report_detected('SMIL file') return smil elif doc.tag == '{http://xspf.org/ns/0/}playlist': self.report_detected('XSPF playlist') return self.playlist_result( self._parse_xspf( doc, video_id, xspf_url=url, xspf_base_url=new_url), video_id) elif re.match(r'(?i)^(?:{[^}]+})?MPD$', doc.tag): info_dict['formats'], info_dict['subtitles'] = self._parse_mpd_formats_and_subtitles( doc, # Do not use yt_dlp.utils.base_url here since it will raise on file:// URLs mpd_base_url=update_url(new_url, query=None, fragment=None).rpartition('/')[0], mpd_url=url) info_dict['live_status'] = 'is_live' if doc.get('type') == 'dynamic' else None self._extra_manifest_info(info_dict, url) self.report_detected('DASH manifest') return info_dict elif re.match(r'^{http://ns\.adobe\.com/f4m/[12]\.0}manifest$', doc.tag): info_dict['formats'] = self._parse_f4m_formats(doc, url, video_id) self.report_detected('F4M manifest') return info_dict except xml.etree.ElementTree.ParseError: pass info_dict.update({ # it's tempting to parse this further, but you would # have to take into account all the variations like # Video Title - Site Name # Site Name | Video Title # Video Title - Tagline | Site Name # and so on and so forth; it's just not practical 'title': self._generic_title('', webpage, default='video'), 'description': self._og_search_description(webpage, default=None), 'thumbnail': self._og_search_thumbnail(webpage, default=None), 'age_limit': self._rta_search(webpage), }) 
self._downloader.write_debug('Looking for embeds') embeds = list(self._extract_embeds(original_url, webpage, urlh=full_response, info_dict=info_dict)) if len(embeds) == 1: return merge_dicts(embeds[0], info_dict) elif embeds: return self.playlist_result(embeds, **info_dict) raise UnsupportedError(url) def _extract_embeds(self, url, webpage, *, urlh=None, info_dict={}): """Returns an iterator of video entries""" info_dict = types.MappingProxyType(info_dict) # Prevents accidental mutation video_id = traverse_obj(info_dict, 'display_id', 'id') or self._generic_id(url) url, smuggled_data = unsmuggle_url(url, {}) actual_url = urlh.url if urlh else url # Sometimes embedded video player is hidden behind percent encoding # (e.g. https://github.com/ytdl-org/youtube-dl/issues/2448) # Unescaping the whole page allows to handle those cases in a generic way # FIXME: unescaping the whole page may break URLs, commenting out for now. # There probably should be a second run of generic extractor on unescaped webpage. 
# webpage = urllib.parse.unquote(webpage) embeds = [] for ie in self._downloader._ies.values(): if ie.ie_key() in smuggled_data.get('block_ies', []): continue gen = ie.extract_from_webpage(self._downloader, url, webpage) current_embeds = [] try: while True: current_embeds.append(next(gen)) except self.StopExtraction: self.report_detected(f'{ie.IE_NAME} exclusive embed', len(current_embeds), embeds and 'discarding other embeds') return current_embeds except StopIteration: self.report_detected(f'{ie.IE_NAME} embed', len(current_embeds)) embeds.extend(current_embeds) if embeds: return embeds jwplayer_data = self._find_jwplayer_data( webpage, video_id, transform_source=js_to_json) if jwplayer_data: if isinstance(jwplayer_data.get('playlist'), str): self.report_detected('JW Player playlist') return [self.url_result(jwplayer_data['playlist'], 'JWPlatform')] try: info = self._parse_jwplayer_data( jwplayer_data, video_id, require_title=False, base_url=url) if traverse_obj(info, 'formats', ('entries', ..., 'formats')): self.report_detected('JW Player data') return [info] except ExtractorError: # See https://github.com/ytdl-org/youtube-dl/pull/16735 pass # Video.js embed mobj = re.search( r'(?s)\bvideojs\s*\(.+?([a-zA-Z0-9_$]+)\.src\s*\(\s*((?:\[.+?\]|{.+?}))\s*\)\s*;', webpage) if mobj is not None: varname = mobj.group(1) sources = variadic(self._parse_json( mobj.group(2), video_id, transform_source=js_to_json, fatal=False) or []) formats, subtitles, src = [], {}, None for source in sources: src = source.get('src') if not src or not isinstance(src, str): continue src = urllib.parse.urljoin(url, src) src_type = source.get('type') if isinstance(src_type, str): src_type = src_type.lower() ext = determine_ext(src).lower() if src_type == 'video/youtube': return [self.url_result(src, YoutubeIE.ie_key())] if src_type == 'application/dash+xml' or ext == 'mpd': fmts, subs = self._extract_mpd_formats_and_subtitles( src, video_id, mpd_id='dash', fatal=False) formats.extend(fmts) 
self._merge_subtitles(subs, target=subtitles) elif src_type == 'application/x-mpegurl' or ext == 'm3u8': fmts, subs = self._extract_m3u8_formats_and_subtitles( src, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False) formats.extend(fmts) self._merge_subtitles(subs, target=subtitles) if not formats: formats.append({ 'url': src, 'ext': (mimetype2ext(src_type) or ext if ext in KNOWN_EXTENSIONS else 'mp4'), 'http_headers': { 'Referer': actual_url, }, }) # https://docs.videojs.com/player#addRemoteTextTrack # https://html.spec.whatwg.org/multipage/media.html#htmltrackelement for sub_match in re.finditer(rf'(?s){re.escape(varname)}' + r'\.addRemoteTextTrack\(({.+?})\s*,\s*(?:true|false)\)', webpage): sub = self._parse_json( sub_match.group(1), video_id, transform_source=js_to_json, fatal=False) or {} sub_src = str_or_none(sub.get('src')) if not sub_src: continue subtitles.setdefault(dict_get(sub, ('language', 'srclang')) or 'und', []).append({ 'url': urllib.parse.urljoin(url, sub_src), 'name': sub.get('label'), 'http_headers': { 'Referer': actual_url, }, }) if formats or subtitles: self.report_detected('video.js embed') info_dict = {'formats': formats, 'subtitles': subtitles} if formats: self._extra_manifest_info(info_dict, src) return [info_dict] # Look for generic KVS player (before json-ld bc of some urls that break otherwise) found = self._search_regex(( r']+?\bsrc\s*=\s*(["\'])https?://(?:(?!\1)[^?#])+/kt_player\.js\?v=(?P\d+(?:\.\d+)+)\1[^>]*>', r'kt_player\s*\(\s*(["\'])(?:(?!\1)[\w\W])+\1\s*,\s*(["\'])https?://(?:(?!\2)[^?#])+/kt_player\.swf\?v=(?P\d+(?:\.\d+)+)\2\s*,', ), webpage, 'KVS player', group='ver', default=False) if found: self.report_detected('KVS Player') if found.split('.')[0] not in ('4', '5', '6'): self.report_warning(f'Untested major version ({found}) in player engine - download may fail.') return [self._extract_kvs(url, webpage, video_id)] # Looking for http://schema.org/VideoObject json_ld = self._search_json_ld(webpage, 
video_id, default={}) if json_ld.get('url') not in (url, None): self.report_detected('JSON LD') is_direct = json_ld.get('ext') not in (None, *MEDIA_EXTENSIONS.manifests) return [merge_dicts({ '_type': 'video' if is_direct else 'url_transparent', 'url': smuggle_url(json_ld['url'], { 'force_videoid': video_id, 'to_generic': True, 'referer': url, }), }, json_ld)] def check_video(vurl): if YoutubeIE.suitable(vurl): return True if RtmpIE.suitable(vurl): return True vpath = urllib.parse.urlparse(vurl).path vext = determine_ext(vpath, None) return vext not in (None, 'swf', 'png', 'jpg', 'srt', 'sbv', 'sub', 'vtt', 'ttml', 'js', 'xml') def filter_video(urls): return list(filter(check_video, urls)) # Start with something easy: JW Player in SWFObject found = filter_video(re.findall(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)) if found: self.report_detected('JW Player in SFWObject') else: # Look for gorilla-vid style embedding found = filter_video(re.findall(r'''(?sx) (?: jw_plugins| JWPlayerOptions| jwplayer\s*\(\s*["'][^'"]+["']\s*\)\s*\.setup ) .*? ['"]?file['"]?\s*:\s*["\'](.*?)["\']''', webpage)) if found: self.report_detected('JW Player embed') if not found: # Broaden the search a little bit found = filter_video(re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)) if found: self.report_detected('video file') if not found: # Broaden the findall a little bit: JWPlayer JS loader found = filter_video(re.findall( r'[^A-Za-z0-9]?(?:file|video_url)["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage)) if found: self.report_detected('JW Player JS loader') if not found: # Flow player found = filter_video(re.findall(r'''(?xs) flowplayer\("[^"]+",\s* \{[^}]+?\}\s*, \s*\{[^}]+? 
["']?clip["']?\s*:\s*\{\s* ["']?url["']?\s*:\s*["']([^"']+)["'] ''', webpage)) if found: self.report_detected('Flow Player') if not found: # Cinerama player found = re.findall( r"cinerama\.embedPlayer\(\s*\'[^']+\',\s*'([^']+)'", webpage) if found: self.report_detected('Cinerama player') if not found: # Try to find twitter cards info # twitter:player:stream should be checked before twitter:player since # it is expected to contain a raw stream (see # https://dev.twitter.com/cards/types/player#On_twitter.com_via_desktop_browser) found = filter_video(re.findall( r'