Files
yt-dlp/yt_dlp/extractor/generic.py
doe1080 1c6068af99
Some checks failed
CodeQL / Analyze (python) (push) Has been cancelled
Download Tests / Quick Download Tests (push) Has been cancelled
Download Tests / Full Download Tests (ubuntu-latest, 3.10) (push) Has been cancelled
Download Tests / Full Download Tests (ubuntu-latest, 3.11) (push) Has been cancelled
Download Tests / Full Download Tests (ubuntu-latest, 3.12) (push) Has been cancelled
Download Tests / Full Download Tests (ubuntu-latest, 3.13) (push) Has been cancelled
Download Tests / Full Download Tests (ubuntu-latest, pypy-3.11) (push) Has been cancelled
Download Tests / Full Download Tests (windows-latest, 3.9) (push) Has been cancelled
Download Tests / Full Download Tests (windows-latest, pypy-3.11) (push) Has been cancelled
Quick Test / Core Test (push) Has been cancelled
Quick Test / Code check (push) Has been cancelled
Release (master) / release (push) Has been cancelled
Release (master) / publish_pypi (push) Has been cancelled
[cleanup] Move embed tests to dedicated extractors (#13782)
Authored by: doe1080
2025-08-01 20:50:20 +00:00

1316 lines
58 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import os
import re
import types
import urllib.parse
import xml.etree.ElementTree
from .common import InfoExtractor
from .commonprotocols import RtmpIE
from .youtube import YoutubeIE
from ..compat import compat_etree_fromstring
from ..cookies import LenientSimpleCookie
from ..networking.exceptions import HTTPError
from ..networking.impersonate import ImpersonateTarget
from ..utils import (
KNOWN_EXTENSIONS,
MEDIA_EXTENSIONS,
ExtractorError,
UnsupportedError,
determine_ext,
determine_protocol,
dict_get,
extract_basic_auth,
filter_dict,
format_field,
int_or_none,
is_html,
js_to_json,
merge_dicts,
mimetype2ext,
orderedSet,
parse_duration,
parse_resolution,
smuggle_url,
str_or_none,
traverse_obj,
try_call,
unescapeHTML,
unified_timestamp,
unsmuggle_url,
update_url,
update_url_query,
url_or_none,
urlhandle_detect_ext,
urljoin,
variadic,
xpath_attr,
xpath_text,
xpath_with_ns,
)
from ..utils._utils import _UnsafeExtensionError
class GenericIE(InfoExtractor):
IE_DESC = 'Generic downloader that works on some sites'
_VALID_URL = r'.*'
IE_NAME = 'generic'
_NETRC_MACHINE = False # Suppress username warning
_TESTS = [{
# Direct link
# https://github.com/ytdl-org/youtube-dl/commit/c5fa81fe81ce05cd81c20ff4ea6dac3dccdcbf9d
'url': 'https://media.w3.org/2010/05/sintel/trailer.mp4',
'md5': '67d406c2bcb6af27fa886f31aa934bbe',
'info_dict': {
'id': 'trailer',
'ext': 'mp4',
'title': 'trailer',
'direct': True,
'timestamp': 1273772943,
'upload_date': '20100513',
},
}, {
# Direct link: No HEAD support
# https://github.com/ytdl-org/youtube-dl/issues/4032
'url': 'http://ai-radio.org:8000/radio.opus',
'info_dict': {
'id': 'radio',
'ext': 'opus',
'title': 'radio',
},
'skip': 'Invalid URL',
}, {
# Direct link: Incorrect MIME type
# https://github.com/ytdl-org/youtube-dl/commit/c5fa81fe81ce05cd81c20ff4ea6dac3dccdcbf9d
'url': 'https://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
'md5': '4ccbebe5f36706d85221f204d7eb5913',
'info_dict': {
'id': '5_Lennart_Poettering_-_Systemd',
'ext': 'webm',
'title': '5_Lennart_Poettering_-_Systemd',
'direct': True,
'timestamp': 1416498816,
'upload_date': '20141120',
},
}, {
# Direct link: Live HLS; https://castr.com/hlsplayer/
# https://github.com/yt-dlp/yt-dlp/pull/6775
'url': 'https://stream-akamai.castr.com/5b9352dbda7b8c769937e459/live_2361c920455111ea85db6911fe397b9e/index.fmp4.m3u8',
'info_dict': {
'id': 'index.fmp4',
'ext': 'mp4',
'title': str,
'live_status': 'is_live',
},
'params': {'skip_download': 'm3u8'},
}, {
# Compressed when `Accept-Encoding: *`
# https://github.com/ytdl-org/youtube-dl/commit/a074e922967fa571d4f1abb1773c711747060f00
'url': 'http://calimero.tk/muzik/FictionJunction-Parallel_Hearts.flac',
'info_dict': {
'id': 'FictionJunction-Parallel_Hearts',
'ext': 'flac',
'title': 'FictionJunction-Parallel_Hearts',
},
'skip': 'Invalid URL',
}, {
# `Content-Encoding: br` when `Accept-Encoding: *`
# https://github.com/yt-dlp/yt-dlp/commit/3e01ce744a981d8f19ae77ec695005e7000f4703
'url': 'https://www.extra.cz/cauky-lidi-70-dil-babis-predstavil-pohadky-prymulanek-nebo-andrejovy-nove-saty-ac867',
'md5': 'a9a2cad3e54f78e4680c6deef82417e9',
'info_dict': {
'id': 'cauky-lidi-70-dil-babis-predstavil-pohadky-prymulanek-nebo-andrejovy-nove-saty-ac867',
'ext': 'mp4',
'title': 'čauky lidi 70 finall',
'age_limit': 0,
'description': 'md5:47b2673a5b76780d9d329783e1fbf5aa',
'direct': True,
'duration': 318.0,
'thumbnail': r're:https?://media\.extra\.cz/static/img/.+\.jpg',
'timestamp': 1654513791,
'upload_date': '20220606',
},
'params': {'extractor_args': {'generic': {'impersonate': ['chrome']}}},
}, {
# HLS: `Content-Type: audio/mpegurl`; https://bitmovin.com/demos/stream-test
# https://github.com/ytdl-org/youtube-dl/commit/20938f768b16c945c6041ba3c0a7ae1a4e790881
'url': 'https://cdn.bitmovin.com/content/assets/art-of-motion-dash-hls-progressive/m3u8s/f08e80da-bf1d-4e3d-8899-f0f6155f6efa.m3u8',
'info_dict': {
'id': 'f08e80da-bf1d-4e3d-8899-f0f6155f6efa',
'ext': 'mp4',
'title': 'f08e80da-bf1d-4e3d-8899-f0f6155f6efa',
'duration': 211,
'timestamp': 1737363648,
'upload_date': '20250120',
},
'params': {'skip_download': 'm3u8'},
}, {
# HLS: `Content-Type: text/plain`; https://github.com/grafov/m3u8
# https://github.com/ytdl-org/youtube-dl/commit/edd9b71c2cca7e5a0df8799710d9ad410ec77d29
'url': 'https://raw.githubusercontent.com/grafov/m3u8/refs/heads/master/sample-playlists/master.m3u8',
'info_dict': {
'id': 'master',
'ext': 'mp4',
'title': 'master',
},
'params': {'skip_download': 'm3u8'},
}, {
# MPEG-DASH; https://bitmovin.com/demos/stream-test
# https://github.com/ytdl-org/youtube-dl/commit/9d939cec48f06a401fb79eb078c1fc50b2aefbe1
'url': 'https://cdn.bitmovin.com/content/assets/art-of-motion-dash-hls-progressive/mpds/f08e80da-bf1d-4e3d-8899-f0f6155f6efa.mpd',
'info_dict': {
'id': 'f08e80da-bf1d-4e3d-8899-f0f6155f6efa',
'ext': 'mp4',
'title': 'f08e80da-bf1d-4e3d-8899-f0f6155f6efa',
'timestamp': 1737363728,
'upload_date': '20250120',
},
'params': {'skip_download': True},
}, {
# Live MPEG-DASH; https://livesim2.dashif.org/urlgen/create
# https://github.com/yt-dlp/yt-dlp/pull/12256
'url': 'https://livesim2.dashif.org/livesim2/ato_10/testpic_2s/Manifest.mpd',
'info_dict': {
'id': 'Manifest',
'ext': 'mp4',
'title': str,
'live_status': 'is_live',
},
'params': {'skip_download': 'livestream'},
}, {
# SMIL
# https://github.com/ytdl-org/youtube-dl/pull/6428
'url': 'https://api.new.livestream.com/accounts/21/events/7954027/videos/166558123.secure.smil',
'info_dict': {
'id': '166558123.secure',
'ext': 'mp4',
'title': '73fb2379-a624-4b6c-bce4-e46086007f2c',
},
'params': {'skip_download': 'smil'},
}, {
# XSPF playlist; https://shellac-archive.ch/de/index.html
# https://github.com/ytdl-org/youtube-dl/commit/1de5cd3ba51ce67d9a1cd3b40157058e78e46692
'url': 'https://shellac-archive.ch/repository/xspf/22-AL0019Z.xspf',
'info_dict': {
'id': '22-AL0019Z',
},
'playlist_count': 12,
'params': {'skip_download': True},
}, {
# RSS feed
# https://github.com/ytdl-org/youtube-dl/commit/c5fa81fe81ce05cd81c20ff4ea6dac3dccdcbf9d
'url': 'http://phihag.de/2014/youtube-dl/rss2.xml',
'info_dict': {
'id': 'https://phihag.de/2014/youtube-dl/rss2.xml',
'title': 'Zero Punctuation',
'description': 'md5:512ae5f840e52eb3c0d08d4bed08eb3e',
},
'playlist_mincount': 11,
}, {
# RSS feed: Includes enclosure, description, and thumbnails
# https://github.com/ytdl-org/youtube-dl/pull/27405
'url': 'https://anchor.fm/s/dd00e14/podcast/rss',
'info_dict': {
'id': 'https://anchor.fm/s/dd00e14/podcast/rss',
'title': '100% Hydrogen ',
'description': 'md5:7ec96327f8b91a2549a2e74f064022a1',
},
'playlist_count': 1,
'params': {'skip_download': True},
}, {
# RSS feed: Includes guid
'url': 'https://www.omnycontent.com/d/playlist/a7b4f8fe-59d9-4afc-a79a-a90101378abf/bf2c1d80-3656-4449-9d00-a903004e8f84/efbff746-e7c1-463a-9d80-a903004e8f8f/podcast.rss',
'info_dict': {
'id': 'https://www.omnycontent.com/d/playlist/a7b4f8fe-59d9-4afc-a79a-a90101378abf/bf2c1d80-3656-4449-9d00-a903004e8f84/efbff746-e7c1-463a-9d80-a903004e8f8f/podcast.rss',
'title': 'The Little Red Podcast',
'description': 'md5:be809a44b63b0c56fb485caf68685520',
},
'playlist_mincount': 76,
}, {
# RSS feed: Includes enclosure and unsupported URLs
# https://github.com/ytdl-org/youtube-dl/pull/16189
'url': 'https://www.interfax.ru/rss.asp',
'info_dict': {
'id': 'https://www.interfax.ru/rss.asp',
'title': 'Интерфакс',
'description': 'md5:49b6b8905772efba21923942bbc0444c',
},
'playlist_mincount': 25,
}, {
# Webpage starts with a duplicate UTF-8 BOM
# https://github.com/yt-dlp/yt-dlp/commit/80e8493ee7c3083f4e215794e4a67ba5265f24f7
'url': 'https://www.filmarkivet.se/movies/paris-d-moll/',
'md5': 'df02cadc719dcc63d43288366f037754',
'info_dict': {
'id': 'paris-d-moll',
'ext': 'mp4',
'title': 'Paris d-moll',
'age_limit': 0,
'description': 'md5:319e37ea5542293db37e1e13072fe330',
'thumbnail': r're:https?://www\.filmarkivet\.se/wp-content/uploads/.+\.jpg',
},
}, {
# Multiple HTML5 videos
# https://github.com/ytdl-org/youtube-dl/pull/14107
'url': 'https://www.dagbladet.no/nyheter/etter-ett-ars-planlegging-klaffet-endelig-alt---jeg-matte-ta-en-liten-dans/60413035',
'info_dict': {
'id': '60413035',
'title': 'Etter ett års planlegging, klaffet endelig alt: - Jeg måtte ta en liten dans',
'age_limit': 0,
'description': 'md5:bbb4e12e42e78609a74fd421b93b1239',
'thumbnail': r're:https?://www\.dagbladet\.no/images/.+',
},
'playlist_count': 2,
}, {
# Cinerama Player
# https://github.com/ytdl-org/youtube-dl/commit/501f13fbf3d1f7225f91e3e0ad008df2cd3219f1
'url': 'https://www.abc.net.au/res/libraries/cinerama2/examples/single_clip.htm',
'info_dict': {
'id': 'single_clip',
'title': 'Single Clip player examples',
'age_limit': 0,
},
'playlist_count': 3,
}, {
# FIXME: Improve extraction
# Flowplayer
# https://github.com/ytdl-org/youtube-dl/commit/4d805e063c6c4ffd557d7c7cb905a3ed9c926b08
'url': 'https://flowplayer.com/resources/demos/standard-setup',
'info_dict': {
'id': 'playlist',
'ext': 'mp4',
'title': 'playlist',
'duration': 13,
'timestamp': 1539082175,
'upload_date': '20181009',
},
'params': {'skip_download': 'm3u8'},
}, {
# JW Player: YouTube
# https://github.com/ytdl-org/youtube-dl/commit/a0f719854463c6f4226e4042dfa80c1b17154e1d
'url': 'https://media.nationalarchives.gov.uk/index.php/webinar-using-discovery-national-archives-online-catalogue/',
'info_dict': {
'id': 'Mrj4DVp2zeA',
'ext': 'mp4',
'title': 'Using Discovery, The National Archives online catalogue',
'age_limit': 0,
'availability': 'unlisted',
'categories': ['Education'],
'channel': 'The National Archives UK',
'channel_follower_count': int,
'channel_id': 'UCUuzebc1yADDJEnOLA5P9xw',
'channel_url': 'https://www.youtube.com/channel/UCUuzebc1yADDJEnOLA5P9xw',
'chapters': 'count:13',
'description': 'md5:a236581cd2449dd2df4f93412f3f01c6',
'duration': 3066,
'like_count': int,
'live_status': 'not_live',
'media_type': 'video',
'playable_in_embed': True,
'tags': 'count:5',
'thumbnail': r're:https?://i\.ytimg\.com/vi/.+',
'timestamp': 1423757117,
'upload_date': '20150212',
'uploader': 'The National Archives UK',
'uploader_id': '@TheNationalArchivesUK',
'uploader_url': 'https://www.youtube.com/@TheNationalArchivesUK',
'view_count': int,
},
'add_ie': ['Youtube'],
}, {
# JW Player: Complex
# https://github.com/ytdl-org/youtube-dl/commit/a4a554a79354981fcab55de8eaab7b95a40bbb48
'url': 'https://www.indiedb.com/games/king-machine/videos',
'info_dict': {
'id': 'videos-1',
'ext': 'mp4',
'title': 'Videos & Audio - King Machine (1)',
'age_limit': 0,
'description': 'Browse King Machine videos & audio for sweet media. Your eyes will thank you.',
'thumbnail': r're:https?://media\.indiedb\.com/cache/images/.+\.jpg',
'_old_archive_ids': ['generic videos'],
},
}, {
# JW Player: JSON Feed URL
# https://github.com/yt-dlp/yt-dlp/issues/1476
'url': 'https://foodschmooze.org/',
'info_dict': {
'id': 'z00Frhnw',
'ext': 'mp4',
'title': 'Grilling Beef Tenderloin',
'description': '',
'duration': 392.0,
'thumbnail': r're:https?://cdn\.jwplayer\.com/v2/media/.+',
'timestamp': 1465313685,
'upload_date': '20160607',
},
'params': {'skip_download': 'm3u8'},
}, {
# JW Player: RTMP
# https://github.com/ytdl-org/youtube-dl/issues/11993
'url': 'http://www.suffolk.edu/sjc/live.php',
'info_dict': {
'id': 'live',
'ext': 'flv',
'title': 'Massachusetts Supreme Judicial Court Oral Arguments',
},
'skip': 'Invalid URL',
}, {
# KVS Player v7.3.3
# kt_player.js?v=5.1.1
'url': 'https://bogmedia.org/videos/21217/40-nochey-2016/',
'md5': '94166bdb26b4cb1fb9214319a629fc51',
'info_dict': {
'id': '21217',
'ext': 'mp4',
'title': '40 ночей (2016) - BogMedia.org',
'age_limit': 0,
'description': 'md5:4e6d7d622636eb7948275432eb256dc3',
'display_id': '40-nochey-2016',
'thumbnail': r're:https?://bogmedia\.org/contents/videos_screenshots/.+\.jpg',
},
}, {
# KVS Player v7.7.11
# kt_player.js?v=5.5.1
# https://github.com/yt-dlp/yt-dlp/commit/a318f59d14792d25b2206c3f50181e03e8716db7
'url': 'https://youix.com/video/leningrad-zoj/',
'md5': '94f96ba95706dc3880812b27b7d8a2b8',
'info_dict': {
'id': '18485',
'ext': 'mp4',
'title': 'Клип: Ленинград - ЗОЖ скачать, смотреть онлайн | Youix.com',
'age_limit': 0,
'display_id': 'leningrad-zoj',
'thumbnail': r're:https?://youix\.com/contents/videos_screenshots/.+\.jpg',
},
}, {
# KVS Player v7.10.3
# kt_player.js?v=12
# https://github.com/ytdl-org/youtube-dl/commit/fc2beab0e701c497a003f11fef5c0df54fba1da3
'url': 'https://shooshtime.com/videos/346037/fresh-out-of-the-shower/',
'md5': 'c9a97ad528607a4516d4df83a3aeb12c',
'info_dict': {
'id': '346037',
'ext': 'mp4',
'title': 'Fresh out of the shower - Shooshtime',
'age_limit': 18,
'description': 'md5:efd70fd3973f8750d285c743b910580a',
'display_id': 'fresh-out-of-the-shower',
'thumbnail': r're:https?://i\.shoosh\.co/contents/videos_screenshots/.+\.jpg',
},
'expected_warnings': ['Untested major version'],
}, {
# FIXME: Unable to extract flashvars
# KVS Player v7.11.4
# kt_player.js?v=2.11.5.1
# https://github.com/yt-dlp/yt-dlp/commit/a318f59d14792d25b2206c3f50181e03e8716db7
'url': 'https://www.kvs-demo.com/video/105/kelis-4th-of-july/',
'info_dict': {
'id': '105',
'ext': 'mp4',
'title': 'Kelis - 4th Of July',
},
}, {
# KVS Player v7.11.4
# kt_player.js?v=6.3.2
# https://github.com/yt-dlp/yt-dlp/commit/a318f59d14792d25b2206c3f50181e03e8716db7
'url': 'https://www.kvs-demo.com/embed/105/',
'md5': '1ff84c70acaddbb03288c6cc5ee1879f',
'info_dict': {
'id': '105',
'ext': 'mp4',
'title': 'Kelis - 4th Of July / Embed Player',
'age_limit': 0,
'display_id': 'kelis-4th-of-july',
'thumbnail': r're:https?://www\.kvs-demo\.com/contents/videos_screenshots/.+\.jpg',
},
}, {
# twitter:player:stream
# https://github.com/ytdl-org/youtube-dl/commit/371ddb14fe651d4a1e5a8310d6d7c0e395cd92b0
'url': 'https://beltzlaw.com/',
'info_dict': {
'id': 'beltzlaw-1',
'ext': 'mp4',
'title': 'Beltz Law Group | Dallas Traffic Ticket, Accident & Criminal Attorney (1)',
'age_limit': 0,
'description': 'md5:5bdf23fcb76801dc3b31e74cabf82147',
'thumbnail': r're:https?://beltzlaw\.com/wp-content/uploads/.+\.jpg',
'timestamp': int, # varies
'upload_date': str,
'_old_archive_ids': ['generic beltzlaw'],
},
}, {
# twitter:player
# https://github.com/ytdl-org/youtube-dl/commit/329179073b93e37ab76e759d1fe96d8f984367f3
'url': 'https://cine.ar/',
'md5': 'd3e33335e339f04008690118698dfd08',
'info_dict': {
'id': 'cine-1',
'ext': 'webm',
'title': 'CINE.AR (1)',
'age_limit': 0,
'description': 'md5:a4e58f9e2291c940e485f34251898c4a',
'thumbnail': r're:https?://cine\.ar/img/.+\.png',
'_old_archive_ids': ['generic cine'],
},
'params': {'format': 'webm'},
}, {
# JSON-LD: multiple @type
# https://github.com/yt-dlp/yt-dlp/commit/f3c0c77304bc0e5614a65c45629de22f067685ac
'url': 'https://www.nu.nl/280161/video/hoe-een-bladvlo-dit-verwoestende-japanse-onkruid-moet-vernietigen.html',
'info_dict': {
'id': 'ipy2AcGL',
'ext': 'mp4',
'title': 'Hoe een bladvlo dit verwoestende Japanse onkruid moet vernietigen',
'age_limit': 0,
'description': 'md5:6a9d644bab0dc2dc06849c2505d8383d',
'duration': 111.0,
'thumbnail': r're:https?://images\.nu\.nl/.+\.jpg',
'timestamp': 1586584674,
'upload_date': '20200411',
},
'params': {'extractor_args': {'generic': {'impersonate': ['chrome']}}},
}, {
# JSON-LD: unexpected @type
# https://github.com/yt-dlp/yt-dlp/pull/5145
'url': 'https://www.autoweek.nl/autotests/artikel/porsche-911-gt3-rs-rij-impressie-2/',
'info_dict': {
'id': 'porsche-911-gt3-rs-rij-impressie-2',
'ext': 'mp4',
'title': 'Test: Porsche 911 GT3 RS - AutoWeek',
'age_limit': 0,
'description': 'md5:a17b5bd84288448d8f11b838505718fc',
'direct': True,
'thumbnail': r're:https?://images\.autoweek\.nl/.+',
'timestamp': 1664920902,
'upload_date': '20221004',
},
'params': {'extractor_args': {'generic': {'impersonate': ['chrome']}}},
}, {
# JSON-LD: VideoObject
# https://github.com/ytdl-org/youtube-dl/commit/6e6b70d65f0681317c425bfe1e157f3474afbbe8
'url': 'https://breezy.hr/',
'info_dict': {
'id': 'k6gl2kt2eq',
'ext': 'mp4',
'title': 'Breezy HR\'s ATS helps you find & hire employees sooner',
'age_limit': 0,
'average_rating': 4.5,
'description': 'md5:eee75fdd3044c538003f3be327ba01e1',
'duration': 60.1,
'thumbnail': r're:https?://cdn\.prod\.website-files\.com/.+\.webp',
'timestamp': 1485734400,
'upload_date': '20170130',
},
}, {
# Video.js: VOD HLS
# https://github.com/yt-dlp/yt-dlp/pull/6775
'url': 'https://gist.githubusercontent.com/bashonly/2aae0862c50f4a4b84f220c315767208/raw/e3380d413749dabbe804c9c2d8fd9a45142475c7/videojs_hls_test.html',
'info_dict': {
'id': 'videojs_hls_test',
'ext': 'mp4',
'title': 'video',
'age_limit': 0,
'duration': 1800,
},
'params': {'skip_download': 'm3u8'},
}, {
# Video.js: YouTube
# https://github.com/ytdl-org/youtube-dl/commit/63d990d2859d0e981da2e416097655798334431b
'url': 'https://ortcam.com/solidworks-%d1%83%d1%80%d0%be%d0%ba-6-%d0%bd%d0%b0%d1%81%d1%82%d1%80%d0%be%d0%b9%d0%ba%d0%b0-%d1%87%d0%b5%d1%80%d1%82%d0%b5%d0%b6%d0%b0_33f9b7351.html?vid=33f9b7351',
'info_dict': {
'id': 'yygqldloqIk',
'ext': 'mp4',
'title': 'SolidWorks. Урок 6 Настройка чертежа',
'age_limit': 0,
'availability': 'public',
'categories': ['Education'],
'channel': 'PROстое3D',
'channel_follower_count': int,
'channel_id': 'UCy91Bug3dERhbwGh2m2Ijng',
'channel_url': 'https://www.youtube.com/channel/UCy91Bug3dERhbwGh2m2Ijng',
'comment_count': int,
'description': 'md5:baf95267792646afdbf030e4d06b2ab3',
'duration': 1160,
'heatmap': 'count:100',
'like_count': int,
'live_status': 'not_live',
'media_type': 'video',
'playable_in_embed': True,
'tags': 'count:17',
'thumbnail': r're:https?://i\.ytimg\.com/vi/.+',
'timestamp': 1363263144,
'upload_date': '20130314',
'uploader': 'PROстое3D',
'uploader_id': '@PROstoe3D',
'uploader_url': 'https://www.youtube.com/@PROstoe3D',
'view_count': int,
},
'add_ie': ['Youtube'],
}, {
# Redirect
# https://github.com/ytdl-org/youtube-dl/issues/413
'url': 'https://www.google.com/url?rct=j&url=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY',
'info_dict': {
'id': 'cmQHVoWB5FY',
'ext': 'mp4',
'title': 'First Firefox OS phones side-by-side',
'age_limit': 0,
'availability': 'public',
'categories': ['Entertainment'],
'channel': 'The Verge',
'channel_follower_count': int,
'channel_id': 'UCddiUEpeqJcYeBxX1IVBKvQ',
'channel_is_verified': True,
'channel_url': 'https://www.youtube.com/channel/UCddiUEpeqJcYeBxX1IVBKvQ',
'comment_count': int,
'description': 'md5:7a676046ad24d9ea55cdde4a6657c5b3',
'duration': 207,
'like_count': int,
'live_status': 'not_live',
'media_type': 'video',
'playable_in_embed': True,
'tags': 'count:15',
'thumbnail': r're:https?://i\.ytimg\.com/vi/.+',
'timestamp': 1361738430,
'upload_date': '20130224',
'uploader': 'The Verge',
'uploader_id': '@TheVerge',
'uploader_url': 'https://www.youtube.com/@TheVerge',
'view_count': int,
},
'add_ie': ['Youtube'],
}]
def report_following_redirect(self, new_url):
    """Print a `[redirect]` notice naming the URL that will be followed next."""
    notice = f'[redirect] Following redirect to {new_url}'
    self._downloader.to_screen(notice)
def report_detected(self, name, num=1, note=None):
    """Write a debug line announcing what kind of media/embed was identified.

    Nothing is written when `num` is falsy. A count above one is printed as-is
    and `name` gets a plural 's'; any other truthy count is worded as 'a'.
    `note` is passed through `format_field` with the '; %s' template and the
    result is appended to the message.
    """
    plural = num > 1
    if not plural and not num:
        return
    if plural:
        name += 's'
    else:
        num = 'a'
    suffix = format_field(note, None, '; %s')
    self._downloader.write_debug(f'Identified {num} {name}{suffix}')
def _extra_manifest_info(self, info, manifest_url):
    """Apply the manifest-related extractor args to `info`, mutating it in place.

    Reads the `fragment_query`, `key_query`, `hls_key` and `variant_query`
    configuration args. `info['hls_aes']` is always (re)assigned. Afterwards,
    if any format uses the `m3u8_native` protocol, `live_status`/`duration`
    are filled in either from the `is_live` arg or by fetching that playlist.
    """
    def pick_query(override):
        # Query string of the arg when it parses as a URL with a query, else the
        # arg verbatim; an empty arg borrows the query of the manifest URL
        return (urllib.parse.urlparse(override).query or override
                or urllib.parse.urlparse(manifest_url).query)

    for arg_name, info_key in (
        ('fragment_query', 'extra_param_to_segment_url'),
        ('key_query', 'extra_param_to_key_url'),
    ):
        override = self._configuration_arg(arg_name, [None], casesense=True)[0]
        if override is not None:
            info[info_key] = pick_query(override) or None

    hex_re = re.compile(r'(0x)?[\da-f]+', re.IGNORECASE)

    def hex_or_none(value):
        if hex_re.fullmatch(value):
            return value
        return None

    hls_key_args = self._configuration_arg('hls_key', casesense=True)
    info['hls_aes'] = traverse_obj(hls_key_args, {
        'uri': (0, {url_or_none}),
        'key': (0, {hex_or_none}),
        'iv': (1, {hex_or_none}),
    }) or None

    variant_query = self._configuration_arg('variant_query', [None], casesense=True)[0]
    if variant_query is not None:
        extra_params = urllib.parse.parse_qs(pick_query(variant_query))
        for fmt in self._downloader._get_formats(info):
            fmt['url'] = update_url_query(fmt['url'], extra_params)

    # Attempt to detect live HLS or set VOD duration
    hls_format = None
    for candidate in self._downloader._get_formats(info):
        if determine_protocol(candidate) == 'm3u8_native':
            hls_format = candidate
            break
    if not hls_format:
        return

    # An explicit `is_live` arg settles the question without any request
    forced_live = self._configuration_arg('is_live', [None])[0]
    if forced_live is not None:
        info['live_status'] = 'is_live' if forced_live != 'false' else 'not_live'
        return

    request_headers = hls_format.get('http_headers') or info.get('http_headers') or {}
    display_id = info.get('id')
    urlh = self._request_webpage(
        hls_format['url'], display_id, 'Checking m3u8 live status', errnote=False,
        headers={**request_headers, 'Accept-Encoding': 'identity'}, fatal=False)
    if urlh is False:
        return

    # Only keep reading when the response really starts like an M3U playlist
    head = urlh.read(512)
    if not head.startswith(b'#EXTM3U'):
        return

    playlist = self._webpage_read_content(
        urlh, urlh.url, display_id, prefix=head, fatal=False, errnote=False)
    if not playlist:
        return

    duration = self._parse_m3u8_vod_duration(playlist, display_id)
    if not duration:
        info['live_status'] = 'is_live'
    info['duration'] = info.get('duration') or duration
def _extract_rss(self, url, video_id, doc):
    """Turn a parsed RSS document into a playlist of `url_transparent` entries.

    `url` is used verbatim as the playlist id, `video_id` is not used by this
    method (it is kept so the call signature stays stable) and `doc` is the
    root element of the feed; items are read from `./channel/item`.

    Each item contributes one entry pointing at its first enclosure URL, or at
    its `<link>` when no enclosure carries a URL. Items with neither are
    skipped. An item `<guid>` is smuggled into the entry URL as `force_videoid`.
    """
    NS_MAP = {
        'itunes': 'http://www.itunes.com/dtds/podcast-1.0.dtd',
    }

    entries = []
    for it in doc.findall('./channel/item'):
        # Take the first enclosure that really has a URL. An <enclosure> without
        # the attribute used to make next() return None, which dropped the item
        # even though a later enclosure or the <link> was perfectly usable.
        next_url = next(
            filter(None, (e.attrib.get('url') for e in it.findall('./enclosure'))),
            None) or xpath_text(it, 'link', fatal=False)
        if not next_url:
            continue

        guid = try_call(lambda: it.find('guid').text)
        if guid:
            next_url = smuggle_url(next_url, {'force_videoid': guid})

        def itunes(key):
            # Text of the item's <itunes:KEY> child, or None when it is absent
            return xpath_text(it, xpath_with_ns(f'./itunes:{key}', NS_MAP), default=None)

        entries.append({
            '_type': 'url_transparent',
            'url': next_url,
            'title': try_call(lambda: it.find('title').text),
            'description': xpath_text(it, 'description', default=None),
            'timestamp': unified_timestamp(xpath_text(it, 'pubDate', default=None)),
            'duration': parse_duration(itunes('duration')),
            'thumbnail': url_or_none(xpath_attr(it, xpath_with_ns('./itunes:image', NS_MAP), 'href')),
            'episode': itunes('title'),
            'episode_number': int_or_none(itunes('episode')),
            'season_number': int_or_none(itunes('season')),
            # <itunes:explicit> maps to an age limit; unrecognised values give None
            'age_limit': {'true': 18, 'yes': 18, 'false': 0, 'no': 0}.get((itunes('explicit') or '').lower()),
        })

    return {
        '_type': 'playlist',
        'id': url,
        'title': try_call(lambda: doc.find('./channel/title').text),
        'description': try_call(lambda: doc.find('./channel/description').text),
        'entries': entries,
    }
@classmethod
def _kvs_get_real_url(cls, video_url, license_code):
    """Undo the `function/0/` obfuscation of a KVS video URL.

    URLs without that prefix are returned unchanged. Otherwise the first 32
    characters of the fourth '/'-separated path component are permuted using
    the token that `_kvs_get_license_token(license_code)` returns, and the URL
    is reassembled with the rest of it left untouched.
    """
    PREFIX = 'function/0/'
    HASH_LENGTH = 32
    if not video_url.startswith(PREFIX):
        return video_url  # not obfuscated

    parsed = urllib.parse.urlparse(video_url[len(PREFIX):])
    token = cls._kvs_get_license_token(license_code)
    segments = parsed.path.split('/')
    scrambled, tail = segments[3][:HASH_LENGTH], segments[3][HASH_LENGTH:]

    # Walk the positions from last to first, swapping each with the position
    # derived from the running sum of the token values seen so far
    order = list(range(HASH_LENGTH))
    running = 0
    for src in range(HASH_LENGTH - 1, -1, -1):
        running += token[src]
        dest = (src + running) % HASH_LENGTH
        order[src], order[dest] = order[dest], order[src]

    segments[3] = ''.join(scrambled[position] for position in order) + tail
    return urllib.parse.urlunparse(parsed._replace(path='/'.join(segments)))
@staticmethod
def _kvs_get_license_token(license_code):
    """Derive the list of digits used to unscramble KVS URLs from a license code.

    '$' characters are dropped from the code first. Every digit of the derived
    shift string yields four values: that digit added, modulo 10, to four
    consecutive digits of the license code.
    """
    digits = license_code.replace('$', '')
    values = list(map(int, digits))

    # Zeros are replaced by ones before the two overlapping halves are compared
    padded = digits.replace('0', '1')
    middle = len(padded) // 2
    spread = abs(int(padded[:middle + 1]) - int(padded[middle:]))
    shifts = str(4 * spread)[:middle + 1]

    token = []
    for index, shift in enumerate(shifts):
        shift = int(shift)
        for offset in range(4):
            token.append((values[index + offset] + shift) % 10)
    return token
def _extract_kvs(self, url, webpage, video_id):
    """Build an info dict from the `flashvars` object of a KVS player page.

    `url` is the page URL (used as Referer and as base for relative URLs),
    `webpage` its HTML and `video_id` the id used while searching for the
    flashvars JSON. The returned id comes from `flashvars['video_id']`.
    """
    flashvars = self._search_json(
        r'(?s:<script\b[^>]*>.*?var\s+flashvars\s*=)',
        webpage, 'flashvars', video_id, transform_source=js_to_json)

    # The display_id is whatever follows the last '/' of the canonical URL
    display_id = self._search_regex(
        r'(?:<link href="https?://[^"]+/(.+?)/?" rel="canonical"\s*/?>'
        r'|<link rel="canonical" href="https?://[^"]+/(.+?)/?"\s*/?>)',
        webpage, 'display_id', fatal=False)
    title = self._html_search_regex(
        r'<(?:h1|title)>(?:Video: )?(.+?)</(?:h1|title)>', webpage, 'title')

    thumbnail = flashvars['preview_url']
    if thumbnail.startswith('//'):
        # Protocol-relative thumbnail: reuse the part of the page URL before its first '/'
        thumbnail = url.partition('/')[0] + thumbnail

    is_url_key = re.compile(r'^video_(?:url|alt_url\d*)$').match
    formats = []
    for key in [name for name in flashvars if is_url_key(name)]:
        raw_url = flashvars[key]
        if '/get_file/' not in raw_url:
            continue
        format_id = flashvars.get(f'{key}_text', key)
        fmt = {
            'url': urljoin(url, self._kvs_get_real_url(raw_url, flashvars['license_code'])),
            'format_id': format_id,
            'ext': 'mp4',
            **(parse_resolution(format_id) or parse_resolution(raw_url)),
            'http_headers': {'Referer': url},
        }
        if not fmt.get('height'):
            # No height could be parsed for this format; give it an explicit quality
            fmt['quality'] = 1
        formats.append(fmt)

    return {
        'id': flashvars['video_id'],
        'display_id': display_id,
        'title': title,
        'thumbnail': urljoin(url, thumbnail),
        'formats': formats,
    }
def _real_extract(self, url):
# Entry point of the generic extractor. In order, it:
#   1. repairs scheme-less input (or turns it into a search, per `default_search`),
#   2. requests the URL and, if it was redirected, re-dispatches the final URL,
#   3. returns direct media / manifest results decided from the Content-Type
#      header or from the first 512 bytes of the body,
#   4. recognises XML bodies (RSS, ISM, SMIL, XSPF, DASH MPD, F4M),
#   5. otherwise collects page metadata and returns what `_extract_embeds`
#      yields, raising UnsupportedError when it yields nothing.
# Protocol-relative input ('//host/path'): prepend the scheme and re-dispatch
if url.startswith('//'):
return self.url_result(self.http_scheme() + url)
parsed_url = urllib.parse.urlparse(url)
# No scheme at all: the `default_search` param decides what happens next
if not parsed_url.scheme:
default_search = self.get_param('default_search')
if default_search is None:
default_search = 'fixup_error'
if default_search in ('auto', 'auto_warning', 'fixup_error'):
# Input shaped like 'something.tld/...' is retried as an http:// URL
if re.match(r'[^\s/]+\.[^\s/]+/', url):
self.report_warning('The url doesn\'t specify the protocol, trying with http')
return self.url_result('http://' + url)
elif default_search != 'fixup_error':
if default_search == 'auto_warning':
# The literal word 'url'/'URL' was most likely copied from usage text
if re.match(r'^(?:url|URL)$', url):
raise ExtractorError(
f'Invalid URL: {url!r} . Call yt-dlp like this: yt-dlp -v "https://www.youtube.com/watch?v=BaW_jenozKc" ',
expected=True)
else:
self.report_warning(
f'Falling back to youtube search for {url} . Set --default-search "auto" to suppress this warning.')
return self.url_result('ytsearch:' + url)
if default_search in ('error', 'fixup_error'):
raise ExtractorError(
f'{url!r} is not a valid URL. '
f'Set --default-search "ytsearch" (or run yt-dlp "ytsearch:{url}" ) to search YouTube', expected=True)
else:
# Any other value is used as a search prefix; add the ':' separator if missing
if ':' not in default_search:
default_search += ':'
return self.url_result(default_search + url)
# Keep the smuggled form around: it is what gets handed to `_extract_embeds` below
original_url = url
url, smuggled_data = unsmuggle_url(url, {})
force_videoid = None
# `to_generic` in the smuggled data suppresses the fallback warning further down
is_intentional = smuggled_data.get('to_generic')
if 'force_videoid' in smuggled_data:
force_videoid = smuggled_data['force_videoid']
video_id = force_videoid
else:
video_id = self._generic_id(url)
# Do not impersonate by default; see https://github.com/yt-dlp/yt-dlp/issues/11335
impersonate = self._configuration_arg('impersonate', ['false'])
if 'false' in impersonate:
impersonate = None
# Some webservers may serve compressed content of rather big size (e.g. gzipped flac)
# making it impossible to download only chunk of the file (yet we need only 512kB to
# test whether it's HTML or not). According to yt-dlp default Accept-Encoding
# that will always result in downloading the whole file that is not desirable.
# Therefore for extraction pass we have to override Accept-Encoding to any in order
# to accept raw bytes and being able to download only a chunk.
# It may probably better to solve this by checking Content-Type for application/octet-stream
# after a HEAD request, but not sure if we can rely on this.
try:
full_response = self._request_webpage(url, video_id, headers=filter_dict({
'Accept-Encoding': 'identity',
'Referer': smuggled_data.get('referer'),
}), impersonate=impersonate)
except ExtractorError as e:
# Only a 403 flagged as a Cloudflare challenge, received without impersonation,
# is rewritten into the hint below; every other error propagates unchanged
if not (isinstance(e.cause, HTTPError) and e.cause.status == 403
and e.cause.response.get_header('cf-mitigated') == 'challenge'
and e.cause.response.extensions.get('impersonate') is None):
raise
# Drop the `__cf_bm` cookie that came with the challenge response, if any
cf_cookie_domain = traverse_obj(
LenientSimpleCookie(e.cause.response.get_header('set-cookie')),
('__cf_bm', 'domain'))
if cf_cookie_domain:
self.write_debug(f'Clearing __cf_bm cookie for {cf_cookie_domain}')
self.cookiejar.clear(domain=cf_cookie_domain, path='/', name='__cf_bm')
msg = 'Got HTTP Error 403 caused by Cloudflare anti-bot challenge; '
if not self._downloader._impersonate_target_available(ImpersonateTarget()):
msg += ('see https://github.com/yt-dlp/yt-dlp#impersonation for '
'how to install the required impersonation dependency, and ')
raise ExtractorError(
f'{msg}try again with --extractor-args "generic:impersonate"', expected=True)
# A final URL that differs from the requested one (basic-auth credentials aside)
# means a redirect happened: hand the new URL back, carrying over a forced video id
new_url = full_response.url
if new_url != extract_basic_auth(url)[0]:
self.report_following_redirect(new_url)
if force_videoid:
new_url = smuggle_url(new_url, {'force_videoid': force_videoid})
return self.url_result(new_url)
# Baseline metadata shared by all result shapes returned below
info_dict = {
'id': video_id,
'title': self._generic_title(url),
'timestamp': unified_timestamp(full_response.headers.get('Last-Modified')),
}
# Check for direct link to a video
content_type = full_response.headers.get('Content-Type', '').lower()
m = re.match(r'(?P<type>audio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P<format_id>[^;\s]+)', content_type)
if m:
self.report_detected('direct video link')
headers = filter_dict({'Referer': smuggled_data.get('referer')})
format_id = str(m.group('format_id'))
ext = determine_ext(url, default_ext=None) or urlhandle_detect_ext(full_response)
subtitles = {}
if format_id.endswith('mpegurl') or ext == 'm3u8':
formats, subtitles = self._extract_m3u8_formats_and_subtitles(url, video_id, 'mp4', headers=headers)
elif format_id == 'f4m' or ext == 'f4m':
formats = self._extract_f4m_formats(url, video_id, headers=headers)
# Don't check for DASH/mpd here, do it later w/ first_bytes. Same number of requests either way
else:
# Plain media file: a single format pointing at the URL itself
formats = [{
'format_id': format_id,
'url': url,
'ext': ext,
'vcodec': 'none' if m.group('type') == 'audio' else None,
}]
info_dict['direct'] = True
info_dict.update({
'formats': formats,
'subtitles': subtitles,
'http_headers': headers or None,
})
self._extra_manifest_info(info_dict, url)
return info_dict
# From here on the body itself has to be inspected; warn about using the generic
# extractor unless running in test mode or the URL was sent here on purpose
if not self.get_param('test', False) and not is_intentional:
force = self.get_param('force_generic_extractor', False)
self.report_warning('%s generic information extractor' % ('Forcing' if force else 'Falling back on'))
# Sniff only the first 512 bytes so a large media file is not downloaded in full
first_bytes = full_response.read(512)
# Is it an M3U playlist?
if first_bytes.startswith(b'#EXTM3U'):
self.report_detected('M3U playlist')
info_dict['formats'], info_dict['subtitles'] = self._extract_m3u8_formats_and_subtitles(url, video_id, 'mp4')
self._extra_manifest_info(info_dict, url)
return info_dict
# Maybe it's a direct link to a video?
# Be careful not to download the whole thing!
if not is_html(first_bytes):
self.report_warning(
'URL could be a direct video link, returning it as such.')
ext = determine_ext(url)
# Extensions outside the allow-list are replaced by a placeholder extension
if ext not in _UnsafeExtensionError.ALLOWED_EXTENSIONS:
ext = 'unknown_video'
info_dict.update({
'direct': True,
'url': url,
'ext': ext,
})
return info_dict
# It is HTML (or at least text): read the rest, reusing the bytes already consumed
webpage = self._webpage_read_content(
full_response, url, video_id, prefix=first_bytes)
# This interstitial title triggers a second download of the page via `_download_webpage`
if '<title>DPG Media Privacy Gate</title>' in webpage:
webpage = self._download_webpage(url, video_id)
self.report_extraction(video_id)
# Is it an RSS feed, a SMIL file, an XSPF playlist or a MPD manifest?
try:
try:
doc = compat_etree_fromstring(webpage)
except xml.etree.ElementTree.ParseError:
# Second attempt with the encoded bytes instead of the str
doc = compat_etree_fromstring(webpage.encode())
# Dispatch on the root tag of the parsed document
if doc.tag == 'rss':
self.report_detected('RSS feed')
return self._extract_rss(url, video_id, doc)
elif doc.tag == 'SmoothStreamingMedia':
info_dict['formats'], info_dict['subtitles'] = self._parse_ism_formats_and_subtitles(doc, url)
self.report_detected('ISM manifest')
return info_dict
elif re.match(r'^(?:{[^}]+})?smil$', doc.tag):
smil = self._parse_smil(doc, url, video_id)
self.report_detected('SMIL file')
return smil
elif doc.tag == '{http://xspf.org/ns/0/}playlist':
self.report_detected('XSPF playlist')
return self.playlist_result(
self._parse_xspf(
doc, video_id, xspf_url=url,
xspf_base_url=new_url),
video_id)
elif re.match(r'(?i)^(?:{[^}]+})?MPD$', doc.tag):
info_dict['formats'], info_dict['subtitles'] = self._parse_mpd_formats_and_subtitles(
doc,
# Do not use yt_dlp.utils.base_url here since it will raise on file:// URLs
mpd_base_url=update_url(new_url, query=None, fragment=None).rpartition('/')[0],
mpd_url=url)
# An MPD root with type="dynamic" is reported as live
info_dict['live_status'] = 'is_live' if doc.get('type') == 'dynamic' else None
self._extra_manifest_info(info_dict, url)
self.report_detected('DASH manifest')
return info_dict
elif re.match(r'^{http://ns\.adobe\.com/f4m/[12]\.0}manifest$', doc.tag):
info_dict['formats'] = self._parse_f4m_formats(doc, url, video_id)
self.report_detected('F4M manifest')
return info_dict
except xml.etree.ElementTree.ParseError:
# Not well-formed XML: carry on and treat the body as an ordinary web page
pass
info_dict.update({
# it's tempting to parse this further, but you would
# have to take into account all the variations like
# Video Title - Site Name
# Site Name | Video Title
# Video Title - Tagline | Site Name
# and so on and so forth; it's just not practical
'title': self._generic_title('', webpage, default='video'),
'description': self._og_search_description(webpage, default=None),
'thumbnail': self._og_search_thumbnail(webpage, default=None),
'age_limit': self._rta_search(webpage),
})
self._downloader.write_debug('Looking for embeds')
embeds = list(self._extract_embeds(original_url, webpage, urlh=full_response, info_dict=info_dict))
# One embed is merged with the page metadata (embed first in the merge_dicts call);
# several embeds become a playlist carrying the page metadata
if len(embeds) == 1:
return merge_dicts(embeds[0], info_dict)
elif embeds:
return self.playlist_result(embeds, **info_dict)
raise UnsupportedError(url)
def _extract_embeds(self, url, webpage, *, urlh=None, info_dict={}):
    """Return a list of info dicts for the media embedded in `webpage`.

    Detection runs in a fixed order and the first step that produces a
    result returns immediately:
      1. every registered extractor's `extract_from_webpage`
      2. JW Player data, video.js `src(...)` calls, KVS player, JSON-LD
      3. a chain of regex heuristics (JW Player variants, Flow Player,
         Cinerama, Twitter cards, Open Graph, meta/HTTP refresh redirect)

    url        -- page URL; may carry smuggled data (only `block_ies` is read here)
    webpage    -- page contents as a string
    urlh       -- optional response handle; its `url` is sent as Referer and
                  its `Refresh` header is checked for redirects
    info_dict  -- metadata already known about the page; only read, never
                  mutated. `display_id`/`id` seed the video id and
                  `title`/`age_limit` are copied into heuristic entries
                  when present.

    Returns a (possibly empty) list of entries.
    """
    # The shared `{}` default is safe: the mapping is wrapped read-only right away
    info_dict = types.MappingProxyType(info_dict)  # Prevents accidental mutation
    video_id = traverse_obj(info_dict, 'display_id', 'id') or self._generic_id(url)
    url, smuggled_data = unsmuggle_url(url, {})
    actual_url = urlh.url if urlh else url

    # Sometimes embedded video player is hidden behind percent encoding
    # (e.g. https://github.com/ytdl-org/youtube-dl/issues/2448)
    # Unescaping the whole page allows to handle those cases in a generic way
    # FIXME: unescaping the whole page may break URLs, commenting out for now.
    # There probably should be a second run of generic extractor on unescaped webpage.
    # webpage = urllib.parse.unquote(webpage)

    embeds = []
    for ie in self._downloader._ies.values():
        if ie.ie_key() in smuggled_data.get('block_ies', []):
            continue
        gen = ie.extract_from_webpage(self._downloader, url, webpage)
        current_embeds = []
        try:
            # Drain the generator by hand so that embeds collected before
            # a StopExtraction is raised are still available below
            while True:
                current_embeds.append(next(gen))
        except self.StopExtraction:
            # This extractor claims the page exclusively: drop everything
            # gathered from the other extractors
            self.report_detected(f'{ie.IE_NAME} exclusive embed', len(current_embeds),
                                 embeds and 'discarding other embeds')
            return current_embeds
        except StopIteration:
            self.report_detected(f'{ie.IE_NAME} embed', len(current_embeds))
            embeds.extend(current_embeds)

    if embeds:
        return embeds

    jwplayer_data = self._find_jwplayer_data(
        webpage, video_id, transform_source=js_to_json)
    if jwplayer_data:
        if isinstance(jwplayer_data.get('playlist'), str):
            self.report_detected('JW Player playlist')
            return [self.url_result(jwplayer_data['playlist'], 'JWPlatform')]
        try:
            info = self._parse_jwplayer_data(
                jwplayer_data, video_id, require_title=False, base_url=url)
            if traverse_obj(info, 'formats', ('entries', ..., 'formats')):
                self.report_detected('JW Player data')
                return [info]
        except ExtractorError:
            # See https://github.com/ytdl-org/youtube-dl/pull/16735
            pass

    # Video.js embed
    mobj = re.search(
        r'(?s)\bvideojs\s*\(.+?([a-zA-Z0-9_$]+)\.src\s*\(\s*((?:\[.+?\]|{.+?}))\s*\)\s*;',
        webpage)
    if mobj is not None:
        varname = mobj.group(1)
        sources = variadic(self._parse_json(
            mobj.group(2), video_id, transform_source=js_to_json, fatal=False) or [])
        formats, subtitles, src = [], {}, None
        for source in sources:
            # The parsed JSON may contain non-object items; skip them
            # instead of failing on `.get`
            if not isinstance(source, dict):
                continue
            src = source.get('src')
            if not src or not isinstance(src, str):
                continue
            src = urllib.parse.urljoin(url, src)
            src_type = source.get('type')
            if isinstance(src_type, str):
                src_type = src_type.lower()
            ext = determine_ext(src).lower()
            if src_type == 'video/youtube':
                return [self.url_result(src, YoutubeIE.ie_key())]
            if src_type == 'application/dash+xml' or ext == 'mpd':
                fmts, subs = self._extract_mpd_formats_and_subtitles(
                    src, video_id, mpd_id='dash', fatal=False)
                formats.extend(fmts)
                self._merge_subtitles(subs, target=subtitles)
            elif src_type == 'application/x-mpegurl' or ext == 'm3u8':
                fmts, subs = self._extract_m3u8_formats_and_subtitles(
                    src, video_id, 'mp4', entry_protocol='m3u8_native',
                    m3u8_id='hls', fatal=False)
                formats.extend(fmts)
                self._merge_subtitles(subs, target=subtitles)

            # Nothing collected so far: use the source URL itself as a format
            if not formats:
                formats.append({
                    'url': src,
                    # Prefer the extension implied by the MIME type and only
                    # then fall back to the URL extension (or 'mp4').
                    # The explicit parentheses matter: without them the
                    # conditional expression binds looser than `or` and a
                    # valid MIME-derived extension is discarded whenever
                    # the URL extension is unknown.
                    'ext': (mimetype2ext(src_type)
                            or (ext if ext in KNOWN_EXTENSIONS else 'mp4')),
                    'http_headers': {
                        'Referer': actual_url,
                    },
                })

        # https://docs.videojs.com/player#addRemoteTextTrack
        # https://html.spec.whatwg.org/multipage/media.html#htmltrackelement
        for sub_match in re.finditer(rf'(?s){re.escape(varname)}' + r'\.addRemoteTextTrack\(({.+?})\s*,\s*(?:true|false)\)', webpage):
            sub = self._parse_json(
                sub_match.group(1), video_id, transform_source=js_to_json, fatal=False) or {}
            sub_src = str_or_none(sub.get('src'))
            if not sub_src:
                continue
            subtitles.setdefault(dict_get(sub, ('language', 'srclang')) or 'und', []).append({
                'url': urllib.parse.urljoin(url, sub_src),
                'name': sub.get('label'),
                'http_headers': {
                    'Referer': actual_url,
                },
            })
        if formats or subtitles:
            self.report_detected('video.js embed')
            # Separate name so the read-only `info_dict` argument is not shadowed
            videojs_info = {'formats': formats, 'subtitles': subtitles}
            if formats:
                # `src` is whatever the last iterated source left behind
                self._extra_manifest_info(videojs_info, src)
            return [videojs_info]

    # Look for generic KVS player (before json-ld bc of some urls that break otherwise)
    found = self._search_regex((
        r'<script\b[^>]+?\bsrc\s*=\s*(["\'])https?://(?:(?!\1)[^?#])+/kt_player\.js\?v=(?P<ver>\d+(?:\.\d+)+)\1[^>]*>',
        r'kt_player\s*\(\s*(["\'])(?:(?!\1)[\w\W])+\1\s*,\s*(["\'])https?://(?:(?!\2)[^?#])+/kt_player\.swf\?v=(?P<ver>\d+(?:\.\d+)+)\2\s*,',
    ), webpage, 'KVS player', group='ver', default=False)
    if found:
        self.report_detected('KVS Player')
        if found.split('.')[0] not in ('4', '5', '6'):
            self.report_warning(f'Untested major version ({found}) in player engine - download may fail.')
        return [self._extract_kvs(url, webpage, video_id)]

    # Looking for http://schema.org/VideoObject
    json_ld = self._search_json_ld(webpage, video_id, default={})
    if json_ld.get('url') not in (url, None):
        self.report_detected('JSON LD')
        is_direct = json_ld.get('ext') not in (None, *MEDIA_EXTENSIONS.manifests)
        return [merge_dicts({
            '_type': 'video' if is_direct else 'url_transparent',
            'url': smuggle_url(json_ld['url'], {
                'force_videoid': video_id,
                'to_generic': True,
                'referer': url,
            }),
        }, json_ld)]

    def check_video(vurl):
        """Tell whether `vurl` may point to media rather than an auxiliary file"""
        if YoutubeIE.suitable(vurl):
            return True
        if RtmpIE.suitable(vurl):
            return True
        vpath = urllib.parse.urlparse(vurl).path
        vext = determine_ext(vpath, None)
        return vext not in (None, 'swf', 'png', 'jpg', 'srt', 'sbv', 'sub', 'vtt', 'ttml', 'js', 'xml')

    def filter_video(urls):
        """Keep only the URLs accepted by `check_video`"""
        return list(filter(check_video, urls))

    # From here on `found` holds a list of candidate URLs, except in the
    # redirect step where it temporarily holds a match object

    # Start with something easy: JW Player in SWFObject
    found = filter_video(re.findall(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage))
    if found:
        self.report_detected('JW Player in SFWObject')
    else:
        # Look for gorilla-vid style embedding
        found = filter_video(re.findall(r'''(?sx)
(?:
jw_plugins|
JWPlayerOptions|
jwplayer\s*\(\s*["'][^'"]+["']\s*\)\s*\.setup
)
.*?
['"]?file['"]?\s*:\s*["\'](.*?)["\']''', webpage))
        if found:
            self.report_detected('JW Player embed')
    if not found:
        # Broaden the search a little bit
        found = filter_video(re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage))
        if found:
            self.report_detected('video file')
    if not found:
        # Broaden the findall a little bit: JWPlayer JS loader
        found = filter_video(re.findall(
            r'[^A-Za-z0-9]?(?:file|video_url)["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage))
        if found:
            self.report_detected('JW Player JS loader')
    if not found:
        # Flow player
        found = filter_video(re.findall(r'''(?xs)
flowplayer\("[^"]+",\s*
\{[^}]+?\}\s*,
\s*\{[^}]+? ["']?clip["']?\s*:\s*\{\s*
["']?url["']?\s*:\s*["']([^"']+)["']
''', webpage))
        if found:
            self.report_detected('Flow Player')
    if not found:
        # Cinerama player
        found = re.findall(
            r"cinerama\.embedPlayer\(\s*\'[^']+\',\s*'([^']+)'", webpage)
        if found:
            self.report_detected('Cinerama player')
    if not found:
        # Try to find twitter cards info
        # twitter:player:stream should be checked before twitter:player since
        # it is expected to contain a raw stream (see
        # https://dev.twitter.com/cards/types/player#On_twitter.com_via_desktop_browser)
        found = filter_video(re.findall(
            r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage))
        if found:
            self.report_detected('Twitter card')
    if not found:
        # We look for Open Graph info:
        # We have to match any number spaces between elements, some sites try to align them, e.g.: statigr.am
        m_video_type = re.findall(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
        # We only look in og:video if the MIME type is a video, don't try if it's a Flash player:
        # NOTE: re.findall() always returns a list, so this condition is
        # always true; it is left untouched because tightening it would
        # change which pages are matched
        if m_video_type is not None:
            found = filter_video(re.findall(r'<meta.*?property="og:(?:video|audio)".*?content="(.*?)"', webpage))
            if found:
                self.report_detected('Open Graph video info')
    if not found:
        REDIRECT_REGEX = r'[0-9]{,2};\s*(?:URL|url)=\'?([^\'"]+)'
        found = re.search(
            r'(?i)<meta\s+(?=(?:[a-z-]+="[^"]+"\s+)*http-equiv="refresh")'
            rf'(?:[a-z-]+="[^"]+"\s+)*?content="{REDIRECT_REGEX}',
            webpage)
        if not found:
            # Look also in Refresh HTTP header
            refresh_header = urlh and urlh.headers.get('Refresh')
            if refresh_header:
                found = re.search(REDIRECT_REGEX, refresh_header)
        if found:
            new_url = urllib.parse.urljoin(url, unescapeHTML(found.group(1)))
            if new_url != url:
                self.report_following_redirect(new_url)
                return [self.url_result(new_url)]
            else:
                # A redirect to the same URL is not a result
                found = None

    if not found:
        # twitter:player is a https URL to iframe player that may or may not
        # be supported by yt-dlp thus this is checked the very last (see
        # https://dev.twitter.com/cards/types/player#On_twitter.com_via_desktop_browser)
        embed_url = self._html_search_meta('twitter:player', webpage, default=None)
        if embed_url and embed_url != url:
            self.report_detected('twitter:player iframe')
            return [self.url_result(embed_url)]

    if not found:
        return []

    domain_name = self._search_regex(r'^(?:https?://)?([^/]*)/.*', url, 'video uploader', default=None)

    entries = []
    for video_url in orderedSet(found):
        # Resolve backslash escapes (e.g. \u0026) left over from JS strings.
        # 'unicode-escape' decodes its input as Latin-1, so the text is
        # encoded as Latin-1 with backslashreplace: literal non-ASCII
        # characters then survive the round trip instead of being turned
        # into mojibake by a UTF-8 encode.
        video_url = video_url.encode('latin-1', 'backslashreplace').decode('unicode-escape')
        video_url = unescapeHTML(video_url)
        video_url = video_url.replace('\\/', '/')
        video_url = urllib.parse.urljoin(url, video_url)
        video_id = urllib.parse.unquote(os.path.basename(video_url))

        # Sometimes, jwplayer extraction will result in a YouTube URL
        if YoutubeIE.suitable(video_url):
            entries.append(self.url_result(video_url, 'Youtube'))
            continue

        video_id = os.path.splitext(video_id)[0]
        headers = {
            'referer': actual_url,
        }

        entry_info_dict = {
            'id': video_id,
            'uploader': domain_name,
            # `info_dict` defaults to an empty mapping, so these keys may
            # be absent; use .get() rather than raising KeyError
            'title': info_dict.get('title'),
            'age_limit': info_dict.get('age_limit'),
            'http_headers': headers,
        }

        if RtmpIE.suitable(video_url):
            entry_info_dict.update({
                '_type': 'url_transparent',
                'ie_key': RtmpIE.ie_key(),
                'url': video_url,
            })
            entries.append(entry_info_dict)
            continue

        ext = determine_ext(video_url)
        if ext == 'smil':
            # Values already in entry_info_dict take precedence over the SMIL info
            entry_info_dict = {**self._extract_smil_info(video_url, video_id), **entry_info_dict}
        elif ext == 'xspf':
            # An XSPF playlist replaces everything collected so far
            return [self._extract_xspf_playlist(video_url, video_id)]
        elif ext == 'm3u8':
            entry_info_dict['formats'], entry_info_dict['subtitles'] = self._extract_m3u8_formats_and_subtitles(video_url, video_id, ext='mp4', headers=headers)
            self._extra_manifest_info(entry_info_dict, video_url)
        elif ext == 'mpd':
            entry_info_dict['formats'], entry_info_dict['subtitles'] = self._extract_mpd_formats_and_subtitles(video_url, video_id, headers=headers)
            self._extra_manifest_info(entry_info_dict, video_url)
        elif ext == 'f4m':
            entry_info_dict['formats'] = self._extract_f4m_formats(video_url, video_id, headers=headers)
        elif re.search(r'(?i)\.(?:ism|smil)/manifest', video_url) and video_url != url:
            # Just matching .ism/manifest is not enough to be reliably sure
            # whether it's actually an ISM manifest or some other streaming
            # manifest since there are various streaming URL formats
            # possible (see [1]) as well as some other shenanigans like
            # .smil/manifest URLs that actually serve an ISM (see [2]) and
            # so on.
            # Thus the most reasonable way to solve this is to delegate
            # to generic extractor in order to look into the contents of
            # the manifest itself.
            # 1. https://azure.microsoft.com/en-us/documentation/articles/media-services-deliver-content-overview/#streaming-url-formats
            # 2. https://svs.itworkscdn.net/lbcivod/smil:itwfcdn/lbci/170976.smil/Manifest
            entry_info_dict = self.url_result(
                smuggle_url(video_url, {'to_generic': True}),
                GenericIE.ie_key())
        else:
            entry_info_dict['url'] = video_url

        entries.append(entry_info_dict)

    if len(entries) > 1:
        # Number the titles so that the entries can be told apart
        for num, e in enumerate(entries, start=1):
            # 'url' results don't have a title
            if e.get('title') is not None:
                e['title'] = '{} ({})'.format(e['title'], num)
    return entries