import os
import re
import types
import urllib.parse
import xml.etree.ElementTree

from .common import InfoExtractor
from .commonprotocols import RtmpIE
from .youtube import YoutubeIE
from ..compat import compat_etree_fromstring
from ..cookies import LenientSimpleCookie
from ..networking.exceptions import HTTPError
from ..networking.impersonate import ImpersonateTarget
from ..utils import (
    KNOWN_EXTENSIONS,
    MEDIA_EXTENSIONS,
    ExtractorError,
    UnsupportedError,
    determine_ext,
    determine_protocol,
    dict_get,
    extract_basic_auth,
    filter_dict,
    format_field,
    int_or_none,
    is_html,
    js_to_json,
    merge_dicts,
    mimetype2ext,
    orderedSet,
    parse_duration,
    parse_resolution,
    smuggle_url,
    str_or_none,
    traverse_obj,
    try_call,
    unescapeHTML,
    unified_timestamp,
    unsmuggle_url,
    update_url,
    update_url_query,
    url_or_none,
    urlhandle_detect_ext,
    urljoin,
    variadic,
    xpath_attr,
    xpath_text,
    xpath_with_ns,
)
from ..utils._utils import _UnsafeExtensionError


class GenericIE(InfoExtractor):
    IE_DESC = 'Generic downloader that works on some sites'
    _VALID_URL = r'.*'
    IE_NAME = 'generic'
    _NETRC_MACHINE = False  # Suppress username warning
    _TESTS = [{
        # Direct link
        # https://github.com/ytdl-org/youtube-dl/commit/c5fa81fe81ce05cd81c20ff4ea6dac3dccdcbf9d
        'url': 'https://media.w3.org/2010/05/sintel/trailer.mp4',
        'md5': '67d406c2bcb6af27fa886f31aa934bbe',
        'info_dict': {
            'id': 'trailer',
            'ext': 'mp4',
            'title': 'trailer',
            'direct': True,
            'timestamp': 1273772943,
            'upload_date': '20100513',
        },
    }, {
        # Direct link: No HEAD support
        # https://github.com/ytdl-org/youtube-dl/issues/4032
        'url': 'http://ai-radio.org:8000/radio.opus',
        'info_dict': {
            'id': 'radio',
            'ext': 'opus',
            'title': 'radio',
        },
        'skip': 'Invalid URL',
    }, {
        # Direct link: Incorrect MIME type
        # https://github.com/ytdl-org/youtube-dl/commit/c5fa81fe81ce05cd81c20ff4ea6dac3dccdcbf9d
        'url': 'https://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
        'md5': '4ccbebe5f36706d85221f204d7eb5913',
        'info_dict': {
            'id': '5_Lennart_Poettering_-_Systemd',
            'ext': 'webm',
            'title': '5_Lennart_Poettering_-_Systemd',
            'direct': True,
            'timestamp': 1416498816,
            'upload_date': '20141120',
        },
    }, {
        # Direct link: Live HLS; https://castr.com/hlsplayer/
        # https://github.com/yt-dlp/yt-dlp/pull/6775
        'url': 'https://stream-akamai.castr.com/5b9352dbda7b8c769937e459/live_2361c920455111ea85db6911fe397b9e/index.fmp4.m3u8',
        'info_dict': {
            'id': 'index.fmp4',
            'ext': 'mp4',
            'title': str,
            'live_status': 'is_live',
        },
        'params': {'skip_download': 'm3u8'},
    }, {
        # Compressed when `Accept-Encoding: *`
        # https://github.com/ytdl-org/youtube-dl/commit/a074e922967fa571d4f1abb1773c711747060f00
        'url': 'http://calimero.tk/muzik/FictionJunction-Parallel_Hearts.flac',
        'info_dict': {
            'id': 'FictionJunction-Parallel_Hearts',
            'ext': 'flac',
            'title': 'FictionJunction-Parallel_Hearts',
        },
        'skip': 'Invalid URL',
    }, {
        # `Content-Encoding: br` when `Accept-Encoding: *`
        # https://github.com/yt-dlp/yt-dlp/commit/3e01ce744a981d8f19ae77ec695005e7000f4703
        'url': 'https://www.extra.cz/cauky-lidi-70-dil-babis-predstavil-pohadky-prymulanek-nebo-andrejovy-nove-saty-ac867',
        'md5': 'a9a2cad3e54f78e4680c6deef82417e9',
        'info_dict': {
            'id': 'cauky-lidi-70-dil-babis-predstavil-pohadky-prymulanek-nebo-andrejovy-nove-saty-ac867',
            'ext': 'mp4',
            'title': 'čauky lidi 70 finall',
            'age_limit': 0,
            'description': 'md5:47b2673a5b76780d9d329783e1fbf5aa',
            'direct': True,
            'duration': 318.0,
            'thumbnail': r're:https?://media\.extra\.cz/static/img/.+\.jpg',
            'timestamp': 1654513791,
            'upload_date': '20220606',
        },
        'params': {'extractor_args': {'generic': {'impersonate': ['chrome']}}},
    }, {
        # HLS: `Content-Type: audio/mpegurl`; https://bitmovin.com/demos/stream-test
        # https://github.com/ytdl-org/youtube-dl/commit/20938f768b16c945c6041ba3c0a7ae1a4e790881
        'url': 'https://cdn.bitmovin.com/content/assets/art-of-motion-dash-hls-progressive/m3u8s/f08e80da-bf1d-4e3d-8899-f0f6155f6efa.m3u8',
        'info_dict': {
            'id': 'f08e80da-bf1d-4e3d-8899-f0f6155f6efa',
            'ext': 'mp4',
            'title': 'f08e80da-bf1d-4e3d-8899-f0f6155f6efa',
            'duration': 211,
            'timestamp': 1737363648,
            'upload_date': '20250120',
        },
        'params': {'skip_download': 'm3u8'},
    }, {
        # HLS: `Content-Type: text/plain`; https://github.com/grafov/m3u8
        # https://github.com/ytdl-org/youtube-dl/commit/edd9b71c2cca7e5a0df8799710d9ad410ec77d29
        'url': 'https://raw.githubusercontent.com/grafov/m3u8/refs/heads/master/sample-playlists/master.m3u8',
        'info_dict': {
            'id': 'master',
            'ext': 'mp4',
            'title': 'master',
        },
        'params': {'skip_download': 'm3u8'},
    }, {
        # MPEG-DASH; https://bitmovin.com/demos/stream-test
        # https://github.com/ytdl-org/youtube-dl/commit/9d939cec48f06a401fb79eb078c1fc50b2aefbe1
        'url': 'https://cdn.bitmovin.com/content/assets/art-of-motion-dash-hls-progressive/mpds/f08e80da-bf1d-4e3d-8899-f0f6155f6efa.mpd',
        'info_dict': {
            'id': 'f08e80da-bf1d-4e3d-8899-f0f6155f6efa',
            'ext': 'mp4',
            'title': 'f08e80da-bf1d-4e3d-8899-f0f6155f6efa',
            'timestamp': 1737363728,
            'upload_date': '20250120',
        },
        'params': {'skip_download': True},
    }, {
        # Live MPEG-DASH; https://livesim2.dashif.org/urlgen/create
        # https://github.com/yt-dlp/yt-dlp/pull/12256
        'url': 'https://livesim2.dashif.org/livesim2/ato_10/testpic_2s/Manifest.mpd',
        'info_dict': {
            'id': 'Manifest',
            'ext': 'mp4',
            'title': str,
            'live_status': 'is_live',
        },
        'params': {'skip_download': 'livestream'},
    }, {
        # SMIL
        # https://github.com/ytdl-org/youtube-dl/pull/6428
        'url': 'https://api.new.livestream.com/accounts/21/events/7954027/videos/166558123.secure.smil',
        'info_dict': {
            'id': '166558123.secure',
            'ext': 'mp4',
            'title': '73fb2379-a624-4b6c-bce4-e46086007f2c',
        },
        'params': {'skip_download': 'smil'},
    }, {
        # XSPF playlist; https://shellac-archive.ch/de/index.html
        # https://github.com/ytdl-org/youtube-dl/commit/1de5cd3ba51ce67d9a1cd3b40157058e78e46692
        'url': 'https://shellac-archive.ch/repository/xspf/22-AL0019Z.xspf',
        'info_dict': {
            'id': '22-AL0019Z',
        },
        'playlist_count': 12,
        'params': {'skip_download': True},
    }, {
        # RSS feed
        # https://github.com/ytdl-org/youtube-dl/commit/c5fa81fe81ce05cd81c20ff4ea6dac3dccdcbf9d
        'url': 'http://phihag.de/2014/youtube-dl/rss2.xml',
        'info_dict': {
            'id': 'https://phihag.de/2014/youtube-dl/rss2.xml',
            'title': 'Zero Punctuation',
            'description': 'md5:512ae5f840e52eb3c0d08d4bed08eb3e',
        },
        'playlist_mincount': 11,
    }, {
        # RSS feed: Includes enclosure, description, and thumbnails
        # https://github.com/ytdl-org/youtube-dl/pull/27405
        'url': 'https://anchor.fm/s/dd00e14/podcast/rss',
        'info_dict': {
            'id': 'https://anchor.fm/s/dd00e14/podcast/rss',
            'title': '100% Hydrogen ',
            'description': 'md5:7ec96327f8b91a2549a2e74f064022a1',
        },
        'playlist_count': 1,
        'params': {'skip_download': True},
    }, {
        # RSS feed: Includes guid
        'url': 'https://www.omnycontent.com/d/playlist/a7b4f8fe-59d9-4afc-a79a-a90101378abf/bf2c1d80-3656-4449-9d00-a903004e8f84/efbff746-e7c1-463a-9d80-a903004e8f8f/podcast.rss',
        'info_dict': {
            'id': 'https://www.omnycontent.com/d/playlist/a7b4f8fe-59d9-4afc-a79a-a90101378abf/bf2c1d80-3656-4449-9d00-a903004e8f84/efbff746-e7c1-463a-9d80-a903004e8f8f/podcast.rss',
            'title': 'The Little Red Podcast',
            'description': 'md5:be809a44b63b0c56fb485caf68685520',
        },
        'playlist_mincount': 76,
    }, {
        # RSS feed: Includes enclosure and unsupported URLs
        # https://github.com/ytdl-org/youtube-dl/pull/16189
        'url': 'https://www.interfax.ru/rss.asp',
        'info_dict': {
            'id': 'https://www.interfax.ru/rss.asp',
            'title': 'Интерфакс',
            'description': 'md5:49b6b8905772efba21923942bbc0444c',
        },
        'playlist_mincount': 25,
    }, {
        # Webpage starts with a duplicate UTF-8 BOM
        # https://github.com/yt-dlp/yt-dlp/commit/80e8493ee7c3083f4e215794e4a67ba5265f24f7
        'url': 'https://www.filmarkivet.se/movies/paris-d-moll/',
        'md5': 'df02cadc719dcc63d43288366f037754',
        'info_dict': {
            'id': 'paris-d-moll',
            'ext': 'mp4',
            'title': 'Paris d-moll',
            'age_limit': 0,
            'description': 'md5:319e37ea5542293db37e1e13072fe330',
            'thumbnail': r're:https?://www\.filmarkivet\.se/wp-content/uploads/.+\.jpg',
        },
    }, {
        # Multiple HTML5 videos
        # https://github.com/ytdl-org/youtube-dl/pull/14107
        'url': 'https://www.dagbladet.no/nyheter/etter-ett-ars-planlegging-klaffet-endelig-alt---jeg-matte-ta-en-liten-dans/60413035',
        'info_dict': {
            'id': '60413035',
            'title': 'Etter ett års planlegging, klaffet endelig alt: - Jeg måtte ta en liten dans',
            'age_limit': 0,
            'description': 'md5:bbb4e12e42e78609a74fd421b93b1239',
            'thumbnail': r're:https?://www\.dagbladet\.no/images/.+',
        },
        'playlist_count': 2,
    }, {
        # Cinerama Player
        # https://github.com/ytdl-org/youtube-dl/commit/501f13fbf3d1f7225f91e3e0ad008df2cd3219f1
        'url': 'https://www.abc.net.au/res/libraries/cinerama2/examples/single_clip.htm',
        'info_dict': {
            'id': 'single_clip',
            'title': 'Single Clip player examples',
            'age_limit': 0,
        },
        'playlist_count': 3,
    }, {
        # FIXME: Improve extraction
        # Flowplayer
        # https://github.com/ytdl-org/youtube-dl/commit/4d805e063c6c4ffd557d7c7cb905a3ed9c926b08
        'url': 'https://flowplayer.com/resources/demos/standard-setup',
        'info_dict': {
            'id': 'playlist',
            'ext': 'mp4',
            'title': 'playlist',
            'duration': 13,
            'timestamp': 1539082175,
            'upload_date': '20181009',
        },
        'params': {'skip_download': 'm3u8'},
    }, {
        # JW Player: YouTube
        # https://github.com/ytdl-org/youtube-dl/commit/a0f719854463c6f4226e4042dfa80c1b17154e1d
        'url': 'https://media.nationalarchives.gov.uk/index.php/webinar-using-discovery-national-archives-online-catalogue/',
        'info_dict': {
            'id': 'Mrj4DVp2zeA',
            'ext': 'mp4',
            'title': 'Using Discovery, The National Archives’ online catalogue',
            'age_limit': 0,
            'availability': 'unlisted',
            'categories': ['Education'],
            'channel': 'The National Archives UK',
            'channel_follower_count': int,
            'channel_id': 'UCUuzebc1yADDJEnOLA5P9xw',
            'channel_url': 'https://www.youtube.com/channel/UCUuzebc1yADDJEnOLA5P9xw',
            'chapters': 'count:13',
            'description': 'md5:a236581cd2449dd2df4f93412f3f01c6',
            'duration': 3066,
            'like_count': int,
            'live_status': 'not_live',
            'media_type': 'video',
            'playable_in_embed': True,
            'tags': 'count:5',
            'thumbnail': r're:https?://i\.ytimg\.com/vi/.+',
            'timestamp': 1423757117,
            'upload_date': '20150212',
            'uploader': 'The National Archives UK',
            'uploader_id': '@TheNationalArchivesUK',
            'uploader_url': 'https://www.youtube.com/@TheNationalArchivesUK',
            'view_count': int,
        },
        'add_ie': ['Youtube'],
    }, {
        # JW Player: Complex
        # https://github.com/ytdl-org/youtube-dl/commit/a4a554a79354981fcab55de8eaab7b95a40bbb48
        'url': 'https://www.indiedb.com/games/king-machine/videos',
        'info_dict': {
            'id': 'videos-1',
            'ext': 'mp4',
            'title': 'Videos & Audio - King Machine (1)',
            'age_limit': 0,
            'description': 'Browse King Machine videos & audio for sweet media. Your eyes will thank you.',
            'thumbnail': r're:https?://media\.indiedb\.com/cache/images/.+\.jpg',
            '_old_archive_ids': ['generic videos'],
        },
    }, {
        # JW Player: JSON Feed URL
        # https://github.com/yt-dlp/yt-dlp/issues/1476
        'url': 'https://foodschmooze.org/',
        'info_dict': {
            'id': 'z00Frhnw',
            'ext': 'mp4',
            'title': 'Grilling Beef Tenderloin',
            'description': '',
            'duration': 392.0,
            'thumbnail': r're:https?://cdn\.jwplayer\.com/v2/media/.+',
            'timestamp': 1465313685,
            'upload_date': '20160607',
        },
        'params': {'skip_download': 'm3u8'},
    }, {
        # JW Player: RTMP
        # https://github.com/ytdl-org/youtube-dl/issues/11993
        'url': 'http://www.suffolk.edu/sjc/live.php',
        'info_dict': {
            'id': 'live',
            'ext': 'flv',
            'title': 'Massachusetts Supreme Judicial Court Oral Arguments',
        },
        'skip': 'Invalid URL',
    }, {
        # KVS Player v7.3.3
        # kt_player.js?v=5.1.1
        'url': 'https://bogmedia.org/videos/21217/40-nochey-2016/',
        'md5': '94166bdb26b4cb1fb9214319a629fc51',
        'info_dict': {
            'id': '21217',
            'ext': 'mp4',
            'title': '40 ночей (2016) - BogMedia.org',
            'age_limit': 0,
            'description': 'md5:4e6d7d622636eb7948275432eb256dc3',
            'display_id': '40-nochey-2016',
            'thumbnail': r're:https?://bogmedia\.org/contents/videos_screenshots/.+\.jpg',
        },
    }, {
        # KVS Player v7.7.11
        # kt_player.js?v=5.5.1
        # https://github.com/yt-dlp/yt-dlp/commit/a318f59d14792d25b2206c3f50181e03e8716db7
        'url': 'https://youix.com/video/leningrad-zoj/',
        'md5': '94f96ba95706dc3880812b27b7d8a2b8',
        'info_dict': {
            'id': '18485',
            'ext': 'mp4',
            'title': 'Клип: Ленинград - ЗОЖ скачать, смотреть онлайн | Youix.com',
            'age_limit': 0,
            'display_id': 'leningrad-zoj',
            'thumbnail': r're:https?://youix\.com/contents/videos_screenshots/.+\.jpg',
        },
    }, {
        # KVS Player v7.10.3
        # kt_player.js?v=12
        # https://github.com/ytdl-org/youtube-dl/commit/fc2beab0e701c497a003f11fef5c0df54fba1da3
        'url': 'https://shooshtime.com/videos/346037/fresh-out-of-the-shower/',
        'md5': 'c9a97ad528607a4516d4df83a3aeb12c',
        'info_dict': {
            'id': '346037',
            'ext': 'mp4',
            'title': 'Fresh out of the shower - Shooshtime',
            'age_limit': 18,
            'description': 'md5:efd70fd3973f8750d285c743b910580a',
            'display_id': 'fresh-out-of-the-shower',
            'thumbnail': r're:https?://i\.shoosh\.co/contents/videos_screenshots/.+\.jpg',
        },
        'expected_warnings': ['Untested major version'],
    }, {
        # FIXME: Unable to extract flashvars
        # KVS Player v7.11.4
        # kt_player.js?v=2.11.5.1
        # https://github.com/yt-dlp/yt-dlp/commit/a318f59d14792d25b2206c3f50181e03e8716db7
        'url': 'https://www.kvs-demo.com/video/105/kelis-4th-of-july/',
        'info_dict': {
            'id': '105',
            'ext': 'mp4',
            'title': 'Kelis - 4th Of July',
        },
    }, {
        # KVS Player v7.11.4
        # kt_player.js?v=6.3.2
        # https://github.com/yt-dlp/yt-dlp/commit/a318f59d14792d25b2206c3f50181e03e8716db7
        'url': 'https://www.kvs-demo.com/embed/105/',
        'md5': '1ff84c70acaddbb03288c6cc5ee1879f',
        'info_dict': {
            'id': '105',
            'ext': 'mp4',
            'title': 'Kelis - 4th Of July / Embed Player',
            'age_limit': 0,
            'display_id': 'kelis-4th-of-july',
            'thumbnail': r're:https?://www\.kvs-demo\.com/contents/videos_screenshots/.+\.jpg',
        },
    }, {
        # twitter:player:stream
        # https://github.com/ytdl-org/youtube-dl/commit/371ddb14fe651d4a1e5a8310d6d7c0e395cd92b0
        'url': 'https://beltzlaw.com/',
        'info_dict': {
            'id': 'beltzlaw-1',
            'ext': 'mp4',
            'title': 'Beltz Law Group | Dallas Traffic Ticket, Accident & Criminal Attorney (1)',
            'age_limit': 0,
            'description': 'md5:5bdf23fcb76801dc3b31e74cabf82147',
            'thumbnail': r're:https?://beltzlaw\.com/wp-content/uploads/.+\.jpg',
            'timestamp': int,  # varies
            'upload_date': str,
            '_old_archive_ids': ['generic beltzlaw'],
        },
    }, {
        # twitter:player
        # https://github.com/ytdl-org/youtube-dl/commit/329179073b93e37ab76e759d1fe96d8f984367f3
        'url': 'https://cine.ar/',
        'md5': 'd3e33335e339f04008690118698dfd08',
        'info_dict': {
            'id': 'cine-1',
            'ext': 'webm',
            'title': 'CINE.AR (1)',
            'age_limit': 0,
            'description': 'md5:a4e58f9e2291c940e485f34251898c4a',
            'thumbnail': r're:https?://cine\.ar/img/.+\.png',
            '_old_archive_ids': ['generic cine'],
        },
        'params': {'format': 'webm'},
    }, {
        # JSON-LD: multiple @type
        # https://github.com/yt-dlp/yt-dlp/commit/f3c0c77304bc0e5614a65c45629de22f067685ac
        'url': 'https://www.nu.nl/280161/video/hoe-een-bladvlo-dit-verwoestende-japanse-onkruid-moet-vernietigen.html',
        'info_dict': {
            'id': 'ipy2AcGL',
            'ext': 'mp4',
            'title': 'Hoe een bladvlo dit verwoestende Japanse onkruid moet vernietigen',
            'age_limit': 0,
            'description': 'md5:6a9d644bab0dc2dc06849c2505d8383d',
            'duration': 111.0,
            'thumbnail': r're:https?://images\.nu\.nl/.+\.jpg',
            'timestamp': 1586584674,
            'upload_date': '20200411',
        },
        'params': {'extractor_args': {'generic': {'impersonate': ['chrome']}}},
    }, {
        # JSON-LD: unexpected @type
        # https://github.com/yt-dlp/yt-dlp/pull/5145
        'url': 'https://www.autoweek.nl/autotests/artikel/porsche-911-gt3-rs-rij-impressie-2/',
        'info_dict': {
            'id': 'porsche-911-gt3-rs-rij-impressie-2',
            'ext': 'mp4',
            'title': 'Test: Porsche 911 GT3 RS - AutoWeek',
            'age_limit': 0,
            'description': 'md5:a17b5bd84288448d8f11b838505718fc',
            'direct': True,
            'thumbnail': r're:https?://images\.autoweek\.nl/.+',
            'timestamp': 1664920902,
            'upload_date': '20221004',
        },
        'params': {'extractor_args': {'generic': {'impersonate': ['chrome']}}},
    }, {
        # JSON-LD: VideoObject
        # https://github.com/ytdl-org/youtube-dl/commit/6e6b70d65f0681317c425bfe1e157f3474afbbe8
        'url': 'https://breezy.hr/',
        'info_dict': {
            'id': 'k6gl2kt2eq',
            'ext': 'mp4',
            'title': 'Breezy HR\'s ATS helps you find & hire employees sooner',
            'age_limit': 0,
            'average_rating': 4.5,
            'description': 'md5:eee75fdd3044c538003f3be327ba01e1',
            'duration': 60.1,
            'thumbnail': r're:https?://cdn\.prod\.website-files\.com/.+\.webp',
            'timestamp': 1485734400,
            'upload_date': '20170130',
        },
    }, {
        # Video.js: VOD HLS
        # https://github.com/yt-dlp/yt-dlp/pull/6775
        'url': 'https://gist.githubusercontent.com/bashonly/2aae0862c50f4a4b84f220c315767208/raw/e3380d413749dabbe804c9c2d8fd9a45142475c7/videojs_hls_test.html',
        'info_dict': {
            'id': 'videojs_hls_test',
            'ext': 'mp4',
            'title': 'video',
            'age_limit': 0,
            'duration': 1800,
        },
        'params': {'skip_download': 'm3u8'},
    }, {
        # Video.js: YouTube
        # https://github.com/ytdl-org/youtube-dl/commit/63d990d2859d0e981da2e416097655798334431b
        'url': 'https://ortcam.com/solidworks-%d1%83%d1%80%d0%be%d0%ba-6-%d0%bd%d0%b0%d1%81%d1%82%d1%80%d0%be%d0%b9%d0%ba%d0%b0-%d1%87%d0%b5%d1%80%d1%82%d0%b5%d0%b6%d0%b0_33f9b7351.html?vid=33f9b7351',
        'info_dict': {
            'id': 'yygqldloqIk',
            'ext': 'mp4',
            'title': 'SolidWorks. Урок 6 Настройка чертежа',
            'age_limit': 0,
            'availability': 'public',
            'categories': ['Education'],
            'channel': 'PROстое3D',
            'channel_follower_count': int,
            'channel_id': 'UCy91Bug3dERhbwGh2m2Ijng',
            'channel_url': 'https://www.youtube.com/channel/UCy91Bug3dERhbwGh2m2Ijng',
            'comment_count': int,
            'description': 'md5:baf95267792646afdbf030e4d06b2ab3',
            'duration': 1160,
            'heatmap': 'count:100',
            'like_count': int,
            'live_status': 'not_live',
            'media_type': 'video',
            'playable_in_embed': True,
            'tags': 'count:17',
            'thumbnail': r're:https?://i\.ytimg\.com/vi/.+',
            'timestamp': 1363263144,
            'upload_date': '20130314',
            'uploader': 'PROстое3D',
            'uploader_id': '@PROstoe3D',
            'uploader_url': 'https://www.youtube.com/@PROstoe3D',
            'view_count': int,
        },
        'add_ie': ['Youtube'],
    }, {
        # Redirect
        # https://github.com/ytdl-org/youtube-dl/issues/413
        'url': 'https://www.google.com/url?rct=j&url=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY',
        'info_dict': {
            'id': 'cmQHVoWB5FY',
            'ext': 'mp4',
            'title': 'First Firefox OS phones side-by-side',
            'age_limit': 0,
            'availability': 'public',
            'categories': ['Entertainment'],
            'channel': 'The Verge',
            'channel_follower_count': int,
            'channel_id': 'UCddiUEpeqJcYeBxX1IVBKvQ',
            'channel_is_verified': True,
            'channel_url': 'https://www.youtube.com/channel/UCddiUEpeqJcYeBxX1IVBKvQ',
            'comment_count': int,
            'description': 'md5:7a676046ad24d9ea55cdde4a6657c5b3',
            'duration': 207,
            'like_count': int,
            'live_status': 'not_live',
            'media_type': 'video',
            'playable_in_embed': True,
            'tags': 'count:15',
            'thumbnail': r're:https?://i\.ytimg\.com/vi/.+',
            'timestamp': 1361738430,
            'upload_date': '20130224',
            'uploader': 'The Verge',
            'uploader_id': '@TheVerge',
            'uploader_url': 'https://www.youtube.com/@TheVerge',
            'view_count': int,
        },
        'add_ie': ['Youtube'],
    }]

    def report_following_redirect(self, new_url):
        """Report that a redirect is being followed."""
        self._downloader.to_screen(f'[redirect] Following redirect to {new_url}')

    def report_detected(self, name, num=1, note=None):
        if num > 1:
            name += 's'
        elif not num:
            return
        else:
            num = 'a'

        self._downloader.write_debug(f'Identified {num} {name}{format_field(note, None, "; %s")}')

    def _extra_manifest_info(self, info, manifest_url):
        fragment_query = self._configuration_arg('fragment_query', [None], casesense=True)[0]
        if fragment_query is not None:
            info['extra_param_to_segment_url'] = (
                urllib.parse.urlparse(fragment_query).query or fragment_query
                or urllib.parse.urlparse(manifest_url).query or None)

        key_query = self._configuration_arg('key_query', [None], casesense=True)[0]
        if key_query is not None:
            info['extra_param_to_key_url'] = (
                urllib.parse.urlparse(key_query).query or key_query
                or urllib.parse.urlparse(manifest_url).query or None)

        def hex_or_none(value):
            return value if re.fullmatch(r'(0x)?[\da-f]+', value, re.IGNORECASE) else None

        info['hls_aes'] = traverse_obj(self._configuration_arg('hls_key', casesense=True), {
            'uri': (0, {url_or_none}), 'key': (0, {hex_or_none}), 'iv': (1, {hex_or_none}),
        }) or None

        variant_query = self._configuration_arg('variant_query', [None], casesense=True)[0]
        if variant_query is not None:
            query = urllib.parse.parse_qs(
                urllib.parse.urlparse(variant_query).query or variant_query
                or urllib.parse.urlparse(manifest_url).query)
            for fmt in self._downloader._get_formats(info):
                fmt['url'] = update_url_query(fmt['url'], query)

        # Attempt to detect live HLS or set VOD duration
        m3u8_format = next((f for f in self._downloader._get_formats(info)
                            if determine_protocol(f) == 'm3u8_native'), None)
        if m3u8_format:
            is_live = self._configuration_arg('is_live', [None])[0]
            if is_live is not None:
                info['live_status'] = 'not_live' if is_live == 'false' else 'is_live'
                return
            headers = m3u8_format.get('http_headers') or info.get('http_headers') or {}
            display_id = info.get('id')
            urlh = self._request_webpage(
                m3u8_format['url'], display_id, 'Checking m3u8 live status', errnote=False,
                headers={**headers, 'Accept-Encoding': 'identity'}, fatal=False)
            if urlh is False:
                return
            first_bytes = urlh.read(512)
            if not first_bytes.startswith(b'#EXTM3U'):
                return
            m3u8_doc = self._webpage_read_content(
                urlh, urlh.url, display_id, prefix=first_bytes, fatal=False, errnote=False)
            if not m3u8_doc:
                return
            duration = self._parse_m3u8_vod_duration(m3u8_doc, display_id)
            if not duration:
                info['live_status'] = 'is_live'
            info['duration'] = info.get('duration') or duration
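
    # Illustrative CLI usage of the configuration args read above (values are placeholders;
    # see the yt-dlp README for the authoritative extractor-args syntax):
    #   --extractor-args "generic:fragment_query"          reuse the manifest URL's query for fragment URLs
    #   --extractor-args "generic:variant_query=foo=bar"   append a query to each variant/format URL
    #   --extractor-args "generic:hls_key=URIORKEY,IV"     supply the HLS AES-128 key (or key URI) and IV
    #   --extractor-args "generic:is_live=false"           mark the HLS stream as not live and skip the probe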

    def _extract_rss(self, url, video_id, doc):
        NS_MAP = {
            'itunes': 'http://www.itunes.com/dtds/podcast-1.0.dtd',
        }

        entries = []
        for it in doc.findall('./channel/item'):
            next_url = next(
                (e.attrib.get('url') for e in it.findall('./enclosure')),
                xpath_text(it, 'link', fatal=False))
            if not next_url:
                continue

            guid = try_call(lambda: it.find('guid').text)
            if guid:
                next_url = smuggle_url(next_url, {'force_videoid': guid})

            def itunes(key):
                return xpath_text(it, xpath_with_ns(f'./itunes:{key}', NS_MAP), default=None)

            entries.append({
                '_type': 'url_transparent',
                'url': next_url,
                'title': try_call(lambda: it.find('title').text),
                'description': xpath_text(it, 'description', default=None),
                'timestamp': unified_timestamp(xpath_text(it, 'pubDate', default=None)),
                'duration': parse_duration(itunes('duration')),
                'thumbnail': url_or_none(xpath_attr(it, xpath_with_ns('./itunes:image', NS_MAP), 'href')),
                'episode': itunes('title'),
                'episode_number': int_or_none(itunes('episode')),
                'season_number': int_or_none(itunes('season')),
                'age_limit': {'true': 18, 'yes': 18, 'false': 0, 'no': 0}.get((itunes('explicit') or '').lower()),
            })

        return {
            '_type': 'playlist',
            'id': url,
            'title': try_call(lambda: doc.find('./channel/title').text),
            'description': try_call(lambda: doc.find('./channel/description').text),
            'entries': entries,
        }

    @classmethod
    def _kvs_get_real_url(cls, video_url, license_code):
        if not video_url.startswith('function/0/'):
            return video_url  # not obfuscated

        parsed = urllib.parse.urlparse(video_url[len('function/0/'):])
        license_token = cls._kvs_get_license_token(license_code)
        urlparts = parsed.path.split('/')

        HASH_LENGTH = 32
        hash_ = urlparts[3][:HASH_LENGTH]
        indices = list(range(HASH_LENGTH))

        # Swap indices of hash according to the destination calculated from the license token
        accum = 0
        for src in reversed(range(HASH_LENGTH)):
            accum += license_token[src]
            dest = (src + accum) % HASH_LENGTH
            indices[src], indices[dest] = indices[dest], indices[src]

        urlparts[3] = ''.join(hash_[index] for index in indices) + urlparts[3][HASH_LENGTH:]
        return urllib.parse.urlunparse(parsed._replace(path='/'.join(urlparts)))

    @staticmethod
    def _kvs_get_license_token(license_code):
        license_code = license_code.replace('$', '')
        license_values = [int(char) for char in license_code]

        modlicense = license_code.replace('0', '1')
        center = len(modlicense) // 2
        fronthalf = int(modlicense[:center + 1])
        backhalf = int(modlicense[center:])
        modlicense = str(4 * abs(fronthalf - backhalf))[:center + 1]

        return [
            (license_values[index + offset] + current) % 10
            for index, current in enumerate(map(int, modlicense))
            for offset in range(4)
        ]
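
    # Illustrative sketch of the de-obfuscation above (values are made up, not from a real site):
    #   video_url    = 'function/0/https://host/get_file/2/0123456789abcdef0123456789abcdef12/345/345000.mp4/'
    #   license_code = '$421438962'
    # _kvs_get_real_url() strips the 'function/0/' prefix, derives a digit token from the
    # license code via _kvs_get_license_token(), un-shuffles the first HASH_LENGTH (32)
    # characters of the path component at index 3, and returns the resulting direct URL.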

    def _extract_kvs(self, url, webpage, video_id):
        flashvars = self._search_json(
            r'(?s:<script\b[^>]*>.*?var\s+flashvars\s*=)',
            webpage, 'flashvars', video_id, transform_source=js_to_json)

        # extract the part after the last / as the display_id from the
        # canonical URL.
        display_id = self._search_regex(
            r'(?:<link href="https?://[^"]+/(.+?)/?" rel="canonical"\s*/?>'
            r'|<link rel="canonical" href="https?://[^"]+/(.+?)/?"\s*/?>)',
            webpage, 'display_id', fatal=False)
        title = self._html_search_regex(r'<(?:h1|title)>(?:Video: )?(.+?)</(?:h1|title)>', webpage, 'title')

        thumbnail = flashvars['preview_url']
        if thumbnail.startswith('//'):
            protocol, _, _ = url.partition('/')
            thumbnail = protocol + thumbnail

        url_keys = list(filter(re.compile(r'^video_(?:url|alt_url\d*)$').match, flashvars.keys()))
        formats = []
        for key in url_keys:
            if '/get_file/' not in flashvars[key]:
                continue
            format_id = flashvars.get(f'{key}_text', key)
            formats.append({
                'url': urljoin(url, self._kvs_get_real_url(flashvars[key], flashvars['license_code'])),
                'format_id': format_id,
                'ext': 'mp4',
                **(parse_resolution(format_id) or parse_resolution(flashvars[key])),
                'http_headers': {'Referer': url},
            })
            if not formats[-1].get('height'):
                formats[-1]['quality'] = 1

        return {
            'id': flashvars['video_id'],
            'display_id': display_id,
            'title': title,
            'thumbnail': urljoin(url, thumbnail),
            'formats': formats,
        }

    def _real_extract(self, url):
        if url.startswith('//'):
            return self.url_result(self.http_scheme() + url)

        parsed_url = urllib.parse.urlparse(url)
        if not parsed_url.scheme:
            default_search = self.get_param('default_search')
            if default_search is None:
                default_search = 'fixup_error'

            if default_search in ('auto', 'auto_warning', 'fixup_error'):
                if re.match(r'[^\s/]+\.[^\s/]+/', url):
                    self.report_warning('The url doesn\'t specify the protocol, trying with http')
                    return self.url_result('http://' + url)
                elif default_search != 'fixup_error':
                    if default_search == 'auto_warning':
                        if re.match(r'^(?:url|URL)$', url):
                            raise ExtractorError(
                                f'Invalid URL: {url!r} . Call yt-dlp like this: yt-dlp -v "https://www.youtube.com/watch?v=BaW_jenozKc" ',
                                expected=True)
                        else:
                            self.report_warning(
                                f'Falling back to youtube search for {url} . Set --default-search "auto" to suppress this warning.')
                    return self.url_result('ytsearch:' + url)

            if default_search in ('error', 'fixup_error'):
                raise ExtractorError(
                    f'{url!r} is not a valid URL. '
                    f'Set --default-search "ytsearch" (or run yt-dlp "ytsearch:{url}" ) to search YouTube', expected=True)
            else:
                if ':' not in default_search:
                    default_search += ':'
                return self.url_result(default_search + url)

        original_url = url
        url, smuggled_data = unsmuggle_url(url, {})
        force_videoid = None
        is_intentional = smuggled_data.get('to_generic')
        if 'force_videoid' in smuggled_data:
            force_videoid = smuggled_data['force_videoid']
            video_id = force_videoid
        else:
            video_id = self._generic_id(url)

        # Do not impersonate by default; see https://github.com/yt-dlp/yt-dlp/issues/11335
        impersonate = self._configuration_arg('impersonate', ['false'])
        if 'false' in impersonate:
            impersonate = None
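
        # Example invocations (assuming an impersonation backend such as curl_cffi is installed):
        #   yt-dlp --extractor-args "generic:impersonate" URL          impersonate any available target
        #   yt-dlp --extractor-args "generic:impersonate=chrome" URL   as used by several tests above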

        # Some webservers may serve compressed content of rather big size (e.g. gzipped flac),
        # making it impossible to download only a chunk of the file (yet we only need the first
        # 512 bytes to test whether it is HTML). With yt-dlp's default Accept-Encoding this would
        # always result in downloading the whole file, which is not desirable.
        # Therefore, for the extraction pass we override Accept-Encoding to 'identity' so that the
        # response is raw bytes and only a chunk needs to be downloaded.
        # It might be better to solve this by checking Content-Type for application/octet-stream
        # after a HEAD request, but it is unclear whether that can be relied upon.
        try:
            full_response = self._request_webpage(url, video_id, headers=filter_dict({
                'Accept-Encoding': 'identity',
                'Referer': smuggled_data.get('referer'),
            }), impersonate=impersonate)
        except ExtractorError as e:
            if not (isinstance(e.cause, HTTPError) and e.cause.status == 403
                    and e.cause.response.get_header('cf-mitigated') == 'challenge'
                    and e.cause.response.extensions.get('impersonate') is None):
                raise
            cf_cookie_domain = traverse_obj(
                LenientSimpleCookie(e.cause.response.get_header('set-cookie')),
                ('__cf_bm', 'domain'))
            if cf_cookie_domain:
                self.write_debug(f'Clearing __cf_bm cookie for {cf_cookie_domain}')
                self.cookiejar.clear(domain=cf_cookie_domain, path='/', name='__cf_bm')
            msg = 'Got HTTP Error 403 caused by Cloudflare anti-bot challenge; '
            if not self._downloader._impersonate_target_available(ImpersonateTarget()):
                msg += ('see https://github.com/yt-dlp/yt-dlp#impersonation for '
                        'how to install the required impersonation dependency, and ')
            raise ExtractorError(
                f'{msg}try again with --extractor-args "generic:impersonate"', expected=True)

        new_url = full_response.url
        if new_url != extract_basic_auth(url)[0]:
            self.report_following_redirect(new_url)
            if force_videoid:
                new_url = smuggle_url(new_url, {'force_videoid': force_videoid})
            return self.url_result(new_url)

        info_dict = {
            'id': video_id,
            'title': self._generic_title(url),
            'timestamp': unified_timestamp(full_response.headers.get('Last-Modified')),
        }

        # Check for direct link to a video
        content_type = full_response.headers.get('Content-Type', '').lower()
        m = re.match(r'(?P<type>audio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P<format_id>[^;\s]+)', content_type)
        if m:
            self.report_detected('direct video link')
            headers = filter_dict({'Referer': smuggled_data.get('referer')})
            format_id = str(m.group('format_id'))
            ext = determine_ext(url, default_ext=None) or urlhandle_detect_ext(full_response)
            subtitles = {}
            if format_id.endswith('mpegurl') or ext == 'm3u8':
                formats, subtitles = self._extract_m3u8_formats_and_subtitles(url, video_id, 'mp4', headers=headers)
            elif format_id == 'f4m' or ext == 'f4m':
                formats = self._extract_f4m_formats(url, video_id, headers=headers)
            # Don't check for DASH/mpd here, do it later w/ first_bytes. Same number of requests either way
            else:
                formats = [{
                    'format_id': format_id,
                    'url': url,
                    'ext': ext,
                    'vcodec': 'none' if m.group('type') == 'audio' else None,
                }]
                info_dict['direct'] = True
            info_dict.update({
                'formats': formats,
                'subtitles': subtitles,
                'http_headers': headers or None,
            })
            self._extra_manifest_info(info_dict, url)
            return info_dict

        if not self.get_param('test', False) and not is_intentional:
            force = self.get_param('force_generic_extractor', False)
            self.report_warning('%s generic information extractor' % ('Forcing' if force else 'Falling back on'))

        first_bytes = full_response.read(512)

        # Is it an M3U playlist?
        if first_bytes.startswith(b'#EXTM3U'):
            self.report_detected('M3U playlist')
            info_dict['formats'], info_dict['subtitles'] = self._extract_m3u8_formats_and_subtitles(url, video_id, 'mp4')
            self._extra_manifest_info(info_dict, url)
            return info_dict

        # Maybe it's a direct link to a video?
        # Be careful not to download the whole thing!
        if not is_html(first_bytes):
            self.report_warning(
                'URL could be a direct video link, returning it as such.')
            ext = determine_ext(url)
            if ext not in _UnsafeExtensionError.ALLOWED_EXTENSIONS:
                ext = 'unknown_video'
            info_dict.update({
                'direct': True,
                'url': url,
                'ext': ext,
            })
            return info_dict

        webpage = self._webpage_read_content(
            full_response, url, video_id, prefix=first_bytes)

        if '<title>DPG Media Privacy Gate</title>' in webpage:
            webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # Is it an RSS feed, a SMIL file, an XSPF playlist or a MPD manifest?
        try:
            try:
                doc = compat_etree_fromstring(webpage)
            except xml.etree.ElementTree.ParseError:
                doc = compat_etree_fromstring(webpage.encode())
            if doc.tag == 'rss':
                self.report_detected('RSS feed')
                return self._extract_rss(url, video_id, doc)
            elif doc.tag == 'SmoothStreamingMedia':
                info_dict['formats'], info_dict['subtitles'] = self._parse_ism_formats_and_subtitles(doc, url)
                self.report_detected('ISM manifest')
                return info_dict
            elif re.match(r'^(?:{[^}]+})?smil$', doc.tag):
                smil = self._parse_smil(doc, url, video_id)
                self.report_detected('SMIL file')
                return smil
            elif doc.tag == '{http://xspf.org/ns/0/}playlist':
                self.report_detected('XSPF playlist')
                return self.playlist_result(
                    self._parse_xspf(
                        doc, video_id, xspf_url=url,
                        xspf_base_url=new_url),
                    video_id)
            elif re.match(r'(?i)^(?:{[^}]+})?MPD$', doc.tag):
                info_dict['formats'], info_dict['subtitles'] = self._parse_mpd_formats_and_subtitles(
                    doc,
                    # Do not use yt_dlp.utils.base_url here since it will raise on file:// URLs
                    mpd_base_url=update_url(new_url, query=None, fragment=None).rpartition('/')[0],
                    mpd_url=url)
                info_dict['live_status'] = 'is_live' if doc.get('type') == 'dynamic' else None
                self._extra_manifest_info(info_dict, url)
                self.report_detected('DASH manifest')
                return info_dict
            elif re.match(r'^{http://ns\.adobe\.com/f4m/[12]\.0}manifest$', doc.tag):
                info_dict['formats'] = self._parse_f4m_formats(doc, url, video_id)
                self.report_detected('F4M manifest')
                return info_dict
        except xml.etree.ElementTree.ParseError:
            pass

        info_dict.update({
            # it's tempting to parse this further, but you would
            # have to take into account all the variations like
            #   Video Title - Site Name
            #   Site Name | Video Title
            #   Video Title - Tagline | Site Name
            # and so on and so forth; it's just not practical
            'title': self._generic_title('', webpage, default='video'),
            'description': self._og_search_description(webpage, default=None),
            'thumbnail': self._og_search_thumbnail(webpage, default=None),
            'age_limit': self._rta_search(webpage),
        })

        self._downloader.write_debug('Looking for embeds')
        embeds = list(self._extract_embeds(original_url, webpage, urlh=full_response, info_dict=info_dict))
        if len(embeds) == 1:
            return merge_dicts(embeds[0], info_dict)
        elif embeds:
            return self.playlist_result(embeds, **info_dict)
        raise UnsupportedError(url)

    def _extract_embeds(self, url, webpage, *, urlh=None, info_dict={}):
        """Returns an iterator of video entries"""
        info_dict = types.MappingProxyType(info_dict)  # Prevents accidental mutation
        video_id = traverse_obj(info_dict, 'display_id', 'id') or self._generic_id(url)
        url, smuggled_data = unsmuggle_url(url, {})
        actual_url = urlh.url if urlh else url

        # Sometimes embedded video player is hidden behind percent encoding
        # (e.g. https://github.com/ytdl-org/youtube-dl/issues/2448)
        # Unescaping the whole page allows to handle those cases in a generic way
        # FIXME: unescaping the whole page may break URLs, commenting out for now.
        # There probably should be a second run of generic extractor on unescaped webpage.
        # webpage = urllib.parse.unquote(webpage)

        embeds = []
        for ie in self._downloader._ies.values():
            if ie.ie_key() in smuggled_data.get('block_ies', []):
                continue
            gen = ie.extract_from_webpage(self._downloader, url, webpage)
            current_embeds = []
            try:
                while True:
                    current_embeds.append(next(gen))
            except self.StopExtraction:
                self.report_detected(f'{ie.IE_NAME} exclusive embed', len(current_embeds),
                                     embeds and 'discarding other embeds')
                return current_embeds
            except StopIteration:
                self.report_detected(f'{ie.IE_NAME} embed', len(current_embeds))
                embeds.extend(current_embeds)

        if embeds:
            return embeds

        jwplayer_data = self._find_jwplayer_data(
            webpage, video_id, transform_source=js_to_json)
        if jwplayer_data:
            if isinstance(jwplayer_data.get('playlist'), str):
                self.report_detected('JW Player playlist')
                return [self.url_result(jwplayer_data['playlist'], 'JWPlatform')]
            try:
                info = self._parse_jwplayer_data(
                    jwplayer_data, video_id, require_title=False, base_url=url)
                if traverse_obj(info, 'formats', ('entries', ..., 'formats')):
                    self.report_detected('JW Player data')
                    return [info]
            except ExtractorError:
                # See https://github.com/ytdl-org/youtube-dl/pull/16735
                pass

        # Video.js embed
        mobj = re.search(
            r'(?s)\bvideojs\s*\(.+?([a-zA-Z0-9_$]+)\.src\s*\(\s*((?:\[.+?\]|{.+?}))\s*\)\s*;',
            webpage)
        if mobj is not None:
            varname = mobj.group(1)
            sources = variadic(self._parse_json(
                mobj.group(2), video_id, transform_source=js_to_json, fatal=False) or [])
            formats, subtitles, src = [], {}, None
            for source in sources:
                src = source.get('src')
                if not src or not isinstance(src, str):
                    continue
                src = urllib.parse.urljoin(url, src)
                src_type = source.get('type')
                if isinstance(src_type, str):
                    src_type = src_type.lower()
                ext = determine_ext(src).lower()
                if src_type == 'video/youtube':
                    return [self.url_result(src, YoutubeIE.ie_key())]
                if src_type == 'application/dash+xml' or ext == 'mpd':
                    fmts, subs = self._extract_mpd_formats_and_subtitles(
                        src, video_id, mpd_id='dash', fatal=False)
                    formats.extend(fmts)
                    self._merge_subtitles(subs, target=subtitles)
                elif src_type == 'application/x-mpegurl' or ext == 'm3u8':
                    fmts, subs = self._extract_m3u8_formats_and_subtitles(
                        src, video_id, 'mp4', entry_protocol='m3u8_native',
                        m3u8_id='hls', fatal=False)
                    formats.extend(fmts)
                    self._merge_subtitles(subs, target=subtitles)

                if not formats:
                    formats.append({
                        'url': src,
                        'ext': (mimetype2ext(src_type)
                                or ext if ext in KNOWN_EXTENSIONS else 'mp4'),
                        'http_headers': {
                            'Referer': actual_url,
                        },
                    })
            # https://docs.videojs.com/player#addRemoteTextTrack
            # https://html.spec.whatwg.org/multipage/media.html#htmltrackelement
            for sub_match in re.finditer(rf'(?s){re.escape(varname)}' + r'\.addRemoteTextTrack\(({.+?})\s*,\s*(?:true|false)\)', webpage):
                sub = self._parse_json(
                    sub_match.group(1), video_id, transform_source=js_to_json, fatal=False) or {}
                sub_src = str_or_none(sub.get('src'))
                if not sub_src:
                    continue
                subtitles.setdefault(dict_get(sub, ('language', 'srclang')) or 'und', []).append({
                    'url': urllib.parse.urljoin(url, sub_src),
                    'name': sub.get('label'),
                    'http_headers': {
                        'Referer': actual_url,
                    },
                })
            if formats or subtitles:
                self.report_detected('video.js embed')
                info_dict = {'formats': formats, 'subtitles': subtitles}
                if formats:
                    self._extra_manifest_info(info_dict, src)
                return [info_dict]

        # Look for generic KVS player (before json-ld bc of some urls that break otherwise)
        found = self._search_regex((
            r'<script\b[^>]+?\bsrc\s*=\s*(["\'])https?://(?:(?!\1)[^?#])+/kt_player\.js\?v=(?P<ver>\d+(?:\.\d+)+)\1[^>]*>',
            r'kt_player\s*\(\s*(["\'])(?:(?!\1)[\w\W])+\1\s*,\s*(["\'])https?://(?:(?!\2)[^?#])+/kt_player\.swf\?v=(?P<ver>\d+(?:\.\d+)+)\2\s*,',
        ), webpage, 'KVS player', group='ver', default=False)
        if found:
            self.report_detected('KVS Player')
            if found.split('.')[0] not in ('4', '5', '6'):
                self.report_warning(f'Untested major version ({found}) in player engine - download may fail.')
            return [self._extract_kvs(url, webpage, video_id)]

        # Looking for http://schema.org/VideoObject
        json_ld = self._search_json_ld(webpage, video_id, default={})
        if json_ld.get('url') not in (url, None):
            self.report_detected('JSON LD')
            is_direct = json_ld.get('ext') not in (None, *MEDIA_EXTENSIONS.manifests)
            return [merge_dicts({
                '_type': 'video' if is_direct else 'url_transparent',
                'url': smuggle_url(json_ld['url'], {
                    'force_videoid': video_id,
                    'to_generic': True,
                    'referer': url,
                }),
            }, json_ld)]

        def check_video(vurl):
            if YoutubeIE.suitable(vurl):
                return True
            if RtmpIE.suitable(vurl):
                return True
            vpath = urllib.parse.urlparse(vurl).path
            vext = determine_ext(vpath, None)
            return vext not in (None, 'swf', 'png', 'jpg', 'srt', 'sbv', 'sub', 'vtt', 'ttml', 'js', 'xml')

        def filter_video(urls):
            return list(filter(check_video, urls))

        # Start with something easy: JW Player in SWFObject
        found = filter_video(re.findall(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage))
        if found:
            self.report_detected('JW Player in SWFObject')
        else:
            # Look for gorilla-vid style embedding
            found = filter_video(re.findall(r'''(?sx)
                (?:
                    jw_plugins|
                    JWPlayerOptions|
                    jwplayer\s*\(\s*["'][^'"]+["']\s*\)\s*\.setup
                )
                .*?
                ['"]?file['"]?\s*:\s*["\'](.*?)["\']''', webpage))
            if found:
                self.report_detected('JW Player embed')
        if not found:
            # Broaden the search a little bit
            found = filter_video(re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage))
            if found:
                self.report_detected('video file')
        if not found:
            # Broaden the findall a little bit: JWPlayer JS loader
            found = filter_video(re.findall(
                r'[^A-Za-z0-9]?(?:file|video_url)["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage))
            if found:
                self.report_detected('JW Player JS loader')
        if not found:
            # Flow player
            found = filter_video(re.findall(r'''(?xs)
                flowplayer\("[^"]+",\s*
                    \{[^}]+?\}\s*,
                \s*\{[^}]+? ["']?clip["']?\s*:\s*\{\s*
                    ["']?url["']?\s*:\s*["']([^"']+)["']
            ''', webpage))
            if found:
                self.report_detected('Flow Player')
        if not found:
            # Cinerama player
            found = re.findall(
                r"cinerama\.embedPlayer\(\s*\'[^']+\',\s*'([^']+)'", webpage)
            if found:
                self.report_detected('Cinerama player')
        if not found:
            # Try to find twitter cards info
            # twitter:player:stream should be checked before twitter:player since
            # it is expected to contain a raw stream (see
            # https://dev.twitter.com/cards/types/player#On_twitter.com_via_desktop_browser)
            found = filter_video(re.findall(
                r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage))
            if found:
                self.report_detected('Twitter card')
        if not found:
            # We look for Open Graph info:
            # We have to match any number spaces between elements, some sites try to align them, e.g.: statigr.am
            m_video_type = re.findall(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
            # We only look in og:video if the MIME type is a video, don't try if it's a Flash player:
            if m_video_type is not None:
                found = filter_video(re.findall(r'<meta.*?property="og:(?:video|audio)".*?content="(.*?)"', webpage))
                if found:
                    self.report_detected('Open Graph video info')
        if not found:
            REDIRECT_REGEX = r'[0-9]{,2};\s*(?:URL|url)=\'?([^\'"]+)'
            found = re.search(
                r'(?i)<meta\s+(?=(?:[a-z-]+="[^"]+"\s+)*http-equiv="refresh")'
                rf'(?:[a-z-]+="[^"]+"\s+)*?content="{REDIRECT_REGEX}',
                webpage)
            if not found:
                # Look also in Refresh HTTP header
                refresh_header = urlh and urlh.headers.get('Refresh')
                if refresh_header:
                    found = re.search(REDIRECT_REGEX, refresh_header)
            if found:
                new_url = urllib.parse.urljoin(url, unescapeHTML(found.group(1)))
                if new_url != url:
                    self.report_following_redirect(new_url)
                    return [self.url_result(new_url)]
                else:
                    found = None

        if not found:
            # twitter:player is a https URL to iframe player that may or may not
            # be supported by yt-dlp thus this is checked the very last (see
            # https://dev.twitter.com/cards/types/player#On_twitter.com_via_desktop_browser)
            embed_url = self._html_search_meta('twitter:player', webpage, default=None)
            if embed_url and embed_url != url:
                self.report_detected('twitter:player iframe')
                return [self.url_result(embed_url)]

        if not found:
            return []

        domain_name = self._search_regex(r'^(?:https?://)?([^/]*)/.*', url, 'video uploader', default=None)

        entries = []
        for video_url in orderedSet(found):
            video_url = video_url.encode().decode('unicode-escape')
            video_url = unescapeHTML(video_url)
            video_url = video_url.replace('\\/', '/')
            video_url = urllib.parse.urljoin(url, video_url)
            video_id = urllib.parse.unquote(os.path.basename(video_url))

            # Sometimes, jwplayer extraction will result in a YouTube URL
            if YoutubeIE.suitable(video_url):
                entries.append(self.url_result(video_url, 'Youtube'))
                continue

            video_id = os.path.splitext(video_id)[0]
            headers = {
                'referer': actual_url,
            }

            entry_info_dict = {
                'id': video_id,
                'uploader': domain_name,
                'title': info_dict['title'],
                'age_limit': info_dict['age_limit'],
                'http_headers': headers,
            }

            if RtmpIE.suitable(video_url):
                entry_info_dict.update({
                    '_type': 'url_transparent',
                    'ie_key': RtmpIE.ie_key(),
                    'url': video_url,
                })
                entries.append(entry_info_dict)
                continue

            ext = determine_ext(video_url)
            if ext == 'smil':
                entry_info_dict = {**self._extract_smil_info(video_url, video_id), **entry_info_dict}
            elif ext == 'xspf':
                return [self._extract_xspf_playlist(video_url, video_id)]
            elif ext == 'm3u8':
                entry_info_dict['formats'], entry_info_dict['subtitles'] = self._extract_m3u8_formats_and_subtitles(video_url, video_id, ext='mp4', headers=headers)
                self._extra_manifest_info(entry_info_dict, video_url)
            elif ext == 'mpd':
                entry_info_dict['formats'], entry_info_dict['subtitles'] = self._extract_mpd_formats_and_subtitles(video_url, video_id, headers=headers)
                self._extra_manifest_info(entry_info_dict, video_url)
            elif ext == 'f4m':
                entry_info_dict['formats'] = self._extract_f4m_formats(video_url, video_id, headers=headers)
            elif re.search(r'(?i)\.(?:ism|smil)/manifest', video_url) and video_url != url:
                # Just matching .ism/manifest is not enough to be reliably sure
                # whether it's actually an ISM manifest or some other streaming
                # manifest since there are various streaming URL formats
                # possible (see [1]) as well as some other shenanigans like
                # .smil/manifest URLs that actually serve an ISM (see [2]) and
                # so on.
                # Thus the most reasonable way to solve this is to delegate
                # to generic extractor in order to look into the contents of
                # the manifest itself.
                # 1. https://azure.microsoft.com/en-us/documentation/articles/media-services-deliver-content-overview/#streaming-url-formats
                # 2. https://svs.itworkscdn.net/lbcivod/smil:itwfcdn/lbci/170976.smil/Manifest
                entry_info_dict = self.url_result(
                    smuggle_url(video_url, {'to_generic': True}),
                    GenericIE.ie_key())
            else:
                entry_info_dict['url'] = video_url

            entries.append(entry_info_dict)

        if len(entries) > 1:
            for num, e in enumerate(entries, start=1):
                # 'url' results don't have a title
                if e.get('title') is not None:
                    e['title'] = '{} ({})'.format(e['title'], num)
        return entries