[skip travis] renaming

to avoid using same folder when using pip install for example
This commit is contained in:
Unknown
2020-09-02 20:25:25 +02:00
parent 9688f23716
commit cefecac12c
861 changed files with 452 additions and 452 deletions

View File

@@ -0,0 +1,46 @@
from __future__ import unicode_literals
try:
from .lazy_extractors import *
from .lazy_extractors import _ALL_CLASSES
_LAZY_LOADER = True
except ImportError:
_LAZY_LOADER = False
from .extractors import *
_ALL_CLASSES = [
klass
for name, klass in globals().items()
if name.endswith('IE') and name != 'GenericIE'
]
_ALL_CLASSES.append(GenericIE)
def gen_extractor_classes():
""" Return a list of supported extractors.
The order does matter; the first extractor matched is the one handling the URL.
"""
return _ALL_CLASSES
def gen_extractors():
""" Return a list of an instance of every supported extractor.
The order does matter; the first extractor matched is the one handling the URL.
"""
return [klass() for klass in gen_extractor_classes()]
def list_extractors(age_limit):
"""
Return a list of extractors that are suitable for the given age,
sorted by extractor ID.
"""
return sorted(
filter(lambda ie: ie.is_suitable(age_limit), gen_extractors()),
key=lambda ie: ie.IE_NAME.lower())
def get_info_extractor(ie_name):
"""Returns the info extractor class with the given ie_name"""
return globals()[ie_name + 'IE']

View File

@@ -0,0 +1,193 @@
from __future__ import unicode_literals
import hashlib
import hmac
import re
import time
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
ExtractorError,
js_to_json,
int_or_none,
parse_iso8601,
try_get,
unescapeHTML,
update_url_query,
)
class ABCIE(InfoExtractor):
IE_NAME = 'abc.net.au'
_VALID_URL = r'https?://(?:www\.)?abc\.net\.au/news/(?:[^/]+/){1,2}(?P<id>\d+)'
_TESTS = [{
'url': 'http://www.abc.net.au/news/2014-11-05/australia-to-staff-ebola-treatment-centre-in-sierra-leone/5868334',
'md5': 'cb3dd03b18455a661071ee1e28344d9f',
'info_dict': {
'id': '5868334',
'ext': 'mp4',
'title': 'Australia to help staff Ebola treatment centre in Sierra Leone',
'description': 'md5:809ad29c67a05f54eb41f2a105693a67',
},
'skip': 'this video has expired',
}, {
'url': 'http://www.abc.net.au/news/2015-08-17/warren-entsch-introduces-same-sex-marriage-bill/6702326',
'md5': 'db2a5369238b51f9811ad815b69dc086',
'info_dict': {
'id': 'NvqvPeNZsHU',
'ext': 'mp4',
'upload_date': '20150816',
'uploader': 'ABC News (Australia)',
'description': 'Government backbencher Warren Entsch introduces a cross-party sponsored bill to legalise same-sex marriage, saying the bill is designed to promote "an inclusive Australia, not a divided one.". Read more here: http://ab.co/1Mwc6ef',
'uploader_id': 'NewsOnABC',
'title': 'Marriage Equality: Warren Entsch introduces same sex marriage bill',
},
'add_ie': ['Youtube'],
'skip': 'Not accessible from Travis CI server',
}, {
'url': 'http://www.abc.net.au/news/2015-10-23/nab-lifts-interest-rates-following-westpac-and-cba/6880080',
'md5': 'b96eee7c9edf4fc5a358a0252881cc1f',
'info_dict': {
'id': '6880080',
'ext': 'mp3',
'title': 'NAB lifts interest rates, following Westpac and CBA',
'description': 'md5:f13d8edc81e462fce4a0437c7dc04728',
},
}, {
'url': 'http://www.abc.net.au/news/2015-10-19/6866214',
'only_matching': True,
}]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
mobj = re.search(
r'inline(?P<type>Video|Audio|YouTube)Data\.push\((?P<json_data>[^)]+)\);',
webpage)
if mobj is None:
expired = self._html_search_regex(r'(?s)class="expired-(?:video|audio)".+?<span>(.+?)</span>', webpage, 'expired', None)
if expired:
raise ExtractorError('%s said: %s' % (self.IE_NAME, expired), expected=True)
raise ExtractorError('Unable to extract video urls')
urls_info = self._parse_json(
mobj.group('json_data'), video_id, transform_source=js_to_json)
if not isinstance(urls_info, list):
urls_info = [urls_info]
if mobj.group('type') == 'YouTube':
return self.playlist_result([
self.url_result(url_info['url']) for url_info in urls_info])
formats = [{
'url': url_info['url'],
'vcodec': url_info.get('codec') if mobj.group('type') == 'Video' else 'none',
'width': int_or_none(url_info.get('width')),
'height': int_or_none(url_info.get('height')),
'tbr': int_or_none(url_info.get('bitrate')),
'filesize': int_or_none(url_info.get('filesize')),
} for url_info in urls_info]
self._sort_formats(formats)
return {
'id': video_id,
'title': self._og_search_title(webpage),
'formats': formats,
'description': self._og_search_description(webpage),
'thumbnail': self._og_search_thumbnail(webpage),
}
class ABCIViewIE(InfoExtractor):
IE_NAME = 'abc.net.au:iview'
_VALID_URL = r'https?://iview\.abc\.net\.au/(?:[^/]+/)*video/(?P<id>[^/?#]+)'
_GEO_COUNTRIES = ['AU']
# ABC iview programs are normally available for 14 days only.
_TESTS = [{
'url': 'https://iview.abc.net.au/show/gruen/series/11/video/LE1927H001S00',
'md5': '67715ce3c78426b11ba167d875ac6abf',
'info_dict': {
'id': 'LE1927H001S00',
'ext': 'mp4',
'title': "Series 11 Ep 1",
'series': "Gruen",
'description': 'md5:52cc744ad35045baf6aded2ce7287f67',
'upload_date': '20190925',
'uploader_id': 'abc1',
'timestamp': 1569445289,
},
'params': {
'skip_download': True,
},
}]
def _real_extract(self, url):
video_id = self._match_id(url)
video_params = self._download_json(
'https://iview.abc.net.au/api/programs/' + video_id, video_id)
title = unescapeHTML(video_params.get('title') or video_params['seriesTitle'])
stream = next(s for s in video_params['playlist'] if s.get('type') in ('program', 'livestream'))
house_number = video_params.get('episodeHouseNumber') or video_id
path = '/auth/hls/sign?ts={0}&hn={1}&d=android-tablet'.format(
int(time.time()), house_number)
sig = hmac.new(
b'android.content.res.Resources',
path.encode('utf-8'), hashlib.sha256).hexdigest()
token = self._download_webpage(
'http://iview.abc.net.au{0}&sig={1}'.format(path, sig), video_id)
def tokenize_url(url, token):
return update_url_query(url, {
'hdnea': token,
})
for sd in ('720', 'sd', 'sd-low'):
sd_url = try_get(
stream, lambda x: x['streams']['hls'][sd], compat_str)
if not sd_url:
continue
formats = self._extract_m3u8_formats(
tokenize_url(sd_url, token), video_id, 'mp4',
entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)
if formats:
break
self._sort_formats(formats)
subtitles = {}
src_vtt = stream.get('captions', {}).get('src-vtt')
if src_vtt:
subtitles['en'] = [{
'url': src_vtt,
'ext': 'vtt',
}]
is_live = video_params.get('livestream') == '1'
if is_live:
title = self._live_title(title)
return {
'id': video_id,
'title': title,
'description': video_params.get('description'),
'thumbnail': video_params.get('thumbnail'),
'duration': int_or_none(video_params.get('eventDuration')),
'timestamp': parse_iso8601(video_params.get('pubDate'), ' '),
'series': unescapeHTML(video_params.get('seriesTitle')),
'series_id': video_params.get('seriesHouseNumber') or video_id[:7],
'season_number': int_or_none(self._search_regex(
r'\bSeries\s+(\d+)\b', title, 'season number', default=None)),
'episode_number': int_or_none(self._search_regex(
r'\bEp\s+(\d+)\b', title, 'episode number', default=None)),
'episode_id': house_number,
'uploader_id': video_params.get('channel'),
'formats': formats,
'subtitles': subtitles,
'is_live': is_live,
}

View File

@@ -0,0 +1,148 @@
# coding: utf-8
from __future__ import unicode_literals
import calendar
import re
import time
from .amp import AMPIE
from .common import InfoExtractor
from .youtube import YoutubeIE
from ..compat import compat_urlparse
class AbcNewsVideoIE(AMPIE):
IE_NAME = 'abcnews:video'
_VALID_URL = r'''(?x)
https?://
(?:
abcnews\.go\.com/
(?:
[^/]+/video/(?P<display_id>[0-9a-z-]+)-|
video/embed\?.*?\bid=
)|
fivethirtyeight\.abcnews\.go\.com/video/embed/\d+/
)
(?P<id>\d+)
'''
_TESTS = [{
'url': 'http://abcnews.go.com/ThisWeek/video/week-exclusive-irans-foreign-minister-zarif-20411932',
'info_dict': {
'id': '20411932',
'ext': 'mp4',
'display_id': 'week-exclusive-irans-foreign-minister-zarif',
'title': '\'This Week\' Exclusive: Iran\'s Foreign Minister Zarif',
'description': 'George Stephanopoulos goes one-on-one with Iranian Foreign Minister Dr. Javad Zarif.',
'duration': 180,
'thumbnail': r're:^https?://.*\.jpg$',
},
'params': {
# m3u8 download
'skip_download': True,
},
}, {
'url': 'http://abcnews.go.com/video/embed?id=46979033',
'only_matching': True,
}, {
'url': 'http://abcnews.go.com/2020/video/2020-husband-stands-teacher-jail-student-affairs-26119478',
'only_matching': True,
}]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
display_id = mobj.group('display_id')
video_id = mobj.group('id')
info_dict = self._extract_feed_info(
'http://abcnews.go.com/video/itemfeed?id=%s' % video_id)
info_dict.update({
'id': video_id,
'display_id': display_id,
})
return info_dict
class AbcNewsIE(InfoExtractor):
IE_NAME = 'abcnews'
_VALID_URL = r'https?://abcnews\.go\.com/(?:[^/]+/)+(?P<display_id>[0-9a-z-]+)/story\?id=(?P<id>\d+)'
_TESTS = [{
'url': 'http://abcnews.go.com/Blotter/News/dramatic-video-rare-death-job-america/story?id=10498713#.UIhwosWHLjY',
'info_dict': {
'id': '10505354',
'ext': 'flv',
'display_id': 'dramatic-video-rare-death-job-america',
'title': 'Occupational Hazards',
'description': 'Nightline investigates the dangers that lurk at various jobs.',
'thumbnail': r're:^https?://.*\.jpg$',
'upload_date': '20100428',
'timestamp': 1272412800,
},
'add_ie': ['AbcNewsVideo'],
}, {
'url': 'http://abcnews.go.com/Entertainment/justin-timberlake-performs-stop-feeling-eurovision-2016/story?id=39125818',
'info_dict': {
'id': '38897857',
'ext': 'mp4',
'display_id': 'justin-timberlake-performs-stop-feeling-eurovision-2016',
'title': 'Justin Timberlake Drops Hints For Secret Single',
'description': 'Lara Spencer reports the buzziest stories of the day in "GMA" Pop News.',
'upload_date': '20160515',
'timestamp': 1463329500,
},
'params': {
# m3u8 download
'skip_download': True,
# The embedded YouTube video is blocked due to copyright issues
'playlist_items': '1',
},
'add_ie': ['AbcNewsVideo'],
}, {
'url': 'http://abcnews.go.com/Technology/exclusive-apple-ceo-tim-cook-iphone-cracking-software/story?id=37173343',
'only_matching': True,
}]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
display_id = mobj.group('display_id')
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
video_url = self._search_regex(
r'window\.abcnvideo\.url\s*=\s*"([^"]+)"', webpage, 'video URL')
full_video_url = compat_urlparse.urljoin(url, video_url)
youtube_url = YoutubeIE._extract_url(webpage)
timestamp = None
date_str = self._html_search_regex(
r'<span[^>]+class="timestamp">([^<]+)</span>',
webpage, 'timestamp', fatal=False)
if date_str:
tz_offset = 0
if date_str.endswith(' ET'): # Eastern Time
tz_offset = -5
date_str = date_str[:-3]
date_formats = ['%b. %d, %Y', '%b %d, %Y, %I:%M %p']
for date_format in date_formats:
try:
timestamp = calendar.timegm(time.strptime(date_str.strip(), date_format))
except ValueError:
continue
if timestamp is not None:
timestamp -= tz_offset * 3600
entry = {
'_type': 'url_transparent',
'ie_key': AbcNewsVideoIE.ie_key(),
'url': full_video_url,
'id': video_id,
'display_id': display_id,
'timestamp': timestamp,
}
if youtube_url:
entries = [entry, self.url_result(youtube_url, ie=YoutubeIE.ie_key())]
return self.playlist_result(entries)
return entry

View File

@@ -0,0 +1,137 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
dict_get,
int_or_none,
try_get,
)
class ABCOTVSIE(InfoExtractor):
IE_NAME = 'abcotvs'
IE_DESC = 'ABC Owned Television Stations'
_VALID_URL = r'https?://(?P<site>abc(?:7(?:news|ny|chicago)?|11|13|30)|6abc)\.com(?:(?:/[^/]+)*/(?P<display_id>[^/]+))?/(?P<id>\d+)'
_TESTS = [
{
'url': 'http://abc7news.com/entertainment/east-bay-museum-celebrates-vintage-synthesizers/472581/',
'info_dict': {
'id': '472548',
'display_id': 'east-bay-museum-celebrates-vintage-synthesizers',
'ext': 'mp4',
'title': 'East Bay museum celebrates synthesized music',
'description': 'md5:24ed2bd527096ec2a5c67b9d5a9005f3',
'thumbnail': r're:^https?://.*\.jpg$',
'timestamp': 1421118520,
'upload_date': '20150113',
},
'params': {
# m3u8 download
'skip_download': True,
},
},
{
'url': 'http://abc7news.com/472581',
'only_matching': True,
},
{
'url': 'https://6abc.com/man-75-killed-after-being-struck-by-vehicle-in-chester/5725182/',
'only_matching': True,
},
]
_SITE_MAP = {
'6abc': 'wpvi',
'abc11': 'wtvd',
'abc13': 'ktrk',
'abc30': 'kfsn',
'abc7': 'kabc',
'abc7chicago': 'wls',
'abc7news': 'kgo',
'abc7ny': 'wabc',
}
def _real_extract(self, url):
site, display_id, video_id = re.match(self._VALID_URL, url).groups()
display_id = display_id or video_id
station = self._SITE_MAP[site]
data = self._download_json(
'https://api.abcotvs.com/v2/content', display_id, query={
'id': video_id,
'key': 'otv.web.%s.story' % station,
'station': station,
})['data']
video = try_get(data, lambda x: x['featuredMedia']['video'], dict) or data
video_id = compat_str(dict_get(video, ('id', 'publishedKey'), video_id))
title = video.get('title') or video['linkText']
formats = []
m3u8_url = video.get('m3u8')
if m3u8_url:
formats = self._extract_m3u8_formats(
video['m3u8'].split('?')[0], display_id, 'mp4', m3u8_id='hls', fatal=False)
mp4_url = video.get('mp4')
if mp4_url:
formats.append({
'abr': 128,
'format_id': 'https',
'height': 360,
'url': mp4_url,
'width': 640,
})
self._sort_formats(formats)
image = video.get('image') or {}
return {
'id': video_id,
'display_id': display_id,
'title': title,
'description': dict_get(video, ('description', 'caption'), try_get(video, lambda x: x['meta']['description'])),
'thumbnail': dict_get(image, ('source', 'dynamicSource')),
'timestamp': int_or_none(video.get('date')),
'duration': int_or_none(video.get('length')),
'formats': formats,
}
class ABCOTVSClipsIE(InfoExtractor):
IE_NAME = 'abcotvs:clips'
_VALID_URL = r'https?://clips\.abcotvs\.com/(?:[^/]+/)*video/(?P<id>\d+)'
_TEST = {
'url': 'https://clips.abcotvs.com/kabc/video/214814',
'info_dict': {
'id': '214814',
'ext': 'mp4',
'title': 'SpaceX launch pad explosion destroys rocket, satellite',
'description': 'md5:9f186e5ad8f490f65409965ee9c7be1b',
'upload_date': '20160901',
'timestamp': 1472756695,
},
'params': {
# m3u8 download
'skip_download': True,
},
}
def _real_extract(self, url):
video_id = self._match_id(url)
video_data = self._download_json('https://clips.abcotvs.com/vogo/video/getByIds?ids=' + video_id, video_id)['results'][0]
title = video_data['title']
formats = self._extract_m3u8_formats(
video_data['videoURL'].split('?')[0], video_id, 'mp4')
self._sort_formats(formats)
return {
'id': video_id,
'title': title,
'description': video_data.get('description'),
'thumbnail': video_data.get('thumbnailURL'),
'duration': int_or_none(video_data.get('duration')),
'timestamp': int_or_none(video_data.get('pubDate')),
'formats': formats,
}

View File

@@ -0,0 +1,41 @@
from __future__ import unicode_literals
import re
from .common import InfoExtractor
class AcademicEarthCourseIE(InfoExtractor):
_VALID_URL = r'^https?://(?:www\.)?academicearth\.org/playlists/(?P<id>[^?#/]+)'
IE_NAME = 'AcademicEarth:Course'
_TEST = {
'url': 'http://academicearth.org/playlists/laws-of-nature/',
'info_dict': {
'id': 'laws-of-nature',
'title': 'Laws of Nature',
'description': 'Introduce yourself to the laws of nature with these free online college lectures from Yale, Harvard, and MIT.',
},
'playlist_count': 3,
}
def _real_extract(self, url):
playlist_id = self._match_id(url)
webpage = self._download_webpage(url, playlist_id)
title = self._html_search_regex(
r'<h1 class="playlist-name"[^>]*?>(.*?)</h1>', webpage, 'title')
description = self._html_search_regex(
r'<p class="excerpt"[^>]*?>(.*?)</p>',
webpage, 'description', fatal=False)
urls = re.findall(
r'<li class="lecture-preview">\s*?<a target="_blank" href="([^"]+)">',
webpage)
entries = [self.url_result(u) for u in urls]
return {
'_type': 'playlist',
'id': playlist_id,
'title': title,
'description': description,
'entries': entries,
}

View File

@@ -0,0 +1,135 @@
# coding: utf-8
from __future__ import unicode_literals
import re
import functools
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
clean_html,
float_or_none,
int_or_none,
try_get,
unified_timestamp,
OnDemandPagedList,
)
class ACastIE(InfoExtractor):
IE_NAME = 'acast'
_VALID_URL = r'''(?x)
https?://
(?:
(?:(?:embed|www)\.)?acast\.com/|
play\.acast\.com/s/
)
(?P<channel>[^/]+)/(?P<id>[^/#?]+)
'''
_TESTS = [{
'url': 'https://www.acast.com/sparpodcast/2.raggarmordet-rosterurdetforflutna',
'md5': '16d936099ec5ca2d5869e3a813ee8dc4',
'info_dict': {
'id': '2a92b283-1a75-4ad8-8396-499c641de0d9',
'ext': 'mp3',
'title': '2. Raggarmordet - Röster ur det förflutna',
'description': 'md5:4f81f6d8cf2e12ee21a321d8bca32db4',
'timestamp': 1477346700,
'upload_date': '20161024',
'duration': 2766.602563,
'creator': 'Anton Berg & Martin Johnson',
'series': 'Spår',
'episode': '2. Raggarmordet - Röster ur det förflutna',
}
}, {
'url': 'http://embed.acast.com/adambuxton/ep.12-adam-joeschristmaspodcast2015',
'only_matching': True,
}, {
'url': 'https://play.acast.com/s/rattegangspodden/s04e09-styckmordet-i-helenelund-del-22',
'only_matching': True,
}, {
'url': 'https://play.acast.com/s/sparpodcast/2a92b283-1a75-4ad8-8396-499c641de0d9',
'only_matching': True,
}]
def _real_extract(self, url):
channel, display_id = re.match(self._VALID_URL, url).groups()
s = self._download_json(
'https://feeder.acast.com/api/v1/shows/%s/episodes/%s' % (channel, display_id),
display_id)
media_url = s['url']
if re.search(r'[0-9a-f]{8}-(?:[0-9a-f]{4}-){3}[0-9a-f]{12}', display_id):
episode_url = s.get('episodeUrl')
if episode_url:
display_id = episode_url
else:
channel, display_id = re.match(self._VALID_URL, s['link']).groups()
cast_data = self._download_json(
'https://play-api.acast.com/splash/%s/%s' % (channel, display_id),
display_id)['result']
e = cast_data['episode']
title = e.get('name') or s['title']
return {
'id': compat_str(e['id']),
'display_id': display_id,
'url': media_url,
'title': title,
'description': e.get('summary') or clean_html(e.get('description') or s.get('description')),
'thumbnail': e.get('image'),
'timestamp': unified_timestamp(e.get('publishingDate') or s.get('publishDate')),
'duration': float_or_none(e.get('duration') or s.get('duration')),
'filesize': int_or_none(e.get('contentLength')),
'creator': try_get(cast_data, lambda x: x['show']['author'], compat_str),
'series': try_get(cast_data, lambda x: x['show']['name'], compat_str),
'season_number': int_or_none(e.get('seasonNumber')),
'episode': title,
'episode_number': int_or_none(e.get('episodeNumber')),
}
class ACastChannelIE(InfoExtractor):
IE_NAME = 'acast:channel'
_VALID_URL = r'''(?x)
https?://
(?:
(?:www\.)?acast\.com/|
play\.acast\.com/s/
)
(?P<id>[^/#?]+)
'''
_TESTS = [{
'url': 'https://www.acast.com/todayinfocus',
'info_dict': {
'id': '4efc5294-5385-4847-98bd-519799ce5786',
'title': 'Today in Focus',
'description': 'md5:9ba5564de5ce897faeb12963f4537a64',
},
'playlist_mincount': 35,
}, {
'url': 'http://play.acast.com/s/ft-banking-weekly',
'only_matching': True,
}]
_API_BASE_URL = 'https://play.acast.com/api/'
_PAGE_SIZE = 10
@classmethod
def suitable(cls, url):
return False if ACastIE.suitable(url) else super(ACastChannelIE, cls).suitable(url)
def _fetch_page(self, channel_slug, page):
casts = self._download_json(
self._API_BASE_URL + 'channels/%s/acasts?page=%s' % (channel_slug, page),
channel_slug, note='Download page %d of channel data' % page)
for cast in casts:
yield self.url_result(
'https://play.acast.com/s/%s/%s' % (channel_slug, cast['url']),
'ACast', cast['id'])
def _real_extract(self, url):
channel_slug = self._match_id(url)
channel_data = self._download_json(
self._API_BASE_URL + 'channels/%s' % channel_slug, channel_slug)
entries = OnDemandPagedList(functools.partial(
self._fetch_page, channel_slug), self._PAGE_SIZE)
return self.playlist_result(entries, compat_str(
channel_data['id']), channel_data['name'], channel_data.get('description'))

View File

@@ -0,0 +1,207 @@
# coding: utf-8
from __future__ import unicode_literals
import base64
import binascii
import json
import os
import random
from .common import InfoExtractor
from ..aes import aes_cbc_decrypt
from ..compat import (
compat_b64decode,
compat_ord,
)
from ..utils import (
bytes_to_intlist,
bytes_to_long,
ExtractorError,
float_or_none,
intlist_to_bytes,
long_to_bytes,
pkcs1pad,
strip_or_none,
urljoin,
)
class ADNIE(InfoExtractor):
IE_DESC = 'Anime Digital Network'
_VALID_URL = r'https?://(?:www\.)?animedigitalnetwork\.fr/video/[^/]+/(?P<id>\d+)'
_TEST = {
'url': 'http://animedigitalnetwork.fr/video/blue-exorcist-kyoto-saga/7778-episode-1-debut-des-hostilites',
'md5': 'e497370d847fd79d9d4c74be55575c7a',
'info_dict': {
'id': '7778',
'ext': 'mp4',
'title': 'Blue Exorcist - Kyôto Saga - Épisode 1',
'description': 'md5:2f7b5aa76edbc1a7a92cedcda8a528d5',
}
}
_BASE_URL = 'http://animedigitalnetwork.fr'
_RSA_KEY = (0xc35ae1e4356b65a73b551493da94b8cb443491c0aa092a357a5aee57ffc14dda85326f42d716e539a34542a0d3f363adf16c5ec222d713d5997194030ee2e4f0d1fb328c01a81cf6868c090d50de8e169c6b13d1675b9eeed1cbc51e1fffca9b38af07f37abd790924cd3bee59d0257cfda4fe5f3f0534877e21ce5821447d1b, 65537)
_POS_ALIGN_MAP = {
'start': 1,
'end': 3,
}
_LINE_ALIGN_MAP = {
'middle': 8,
'end': 4,
}
@staticmethod
def _ass_subtitles_timecode(seconds):
return '%01d:%02d:%02d.%02d' % (seconds / 3600, (seconds % 3600) / 60, seconds % 60, (seconds % 1) * 100)
def _get_subtitles(self, sub_path, video_id):
if not sub_path:
return None
enc_subtitles = self._download_webpage(
urljoin(self._BASE_URL, sub_path),
video_id, 'Downloading subtitles location', fatal=False) or '{}'
subtitle_location = (self._parse_json(enc_subtitles, video_id, fatal=False) or {}).get('location')
if subtitle_location:
enc_subtitles = self._download_webpage(
urljoin(self._BASE_URL, subtitle_location),
video_id, 'Downloading subtitles data', fatal=False,
headers={'Origin': 'https://animedigitalnetwork.fr'})
if not enc_subtitles:
return None
# http://animedigitalnetwork.fr/components/com_vodvideo/videojs/adn-vjs.min.js
dec_subtitles = intlist_to_bytes(aes_cbc_decrypt(
bytes_to_intlist(compat_b64decode(enc_subtitles[24:])),
bytes_to_intlist(binascii.unhexlify(self._K + '4b8ef13ec1872730')),
bytes_to_intlist(compat_b64decode(enc_subtitles[:24]))
))
subtitles_json = self._parse_json(
dec_subtitles[:-compat_ord(dec_subtitles[-1])].decode(),
None, fatal=False)
if not subtitles_json:
return None
subtitles = {}
for sub_lang, sub in subtitles_json.items():
ssa = '''[Script Info]
ScriptType:V4.00
[V4 Styles]
Format: Name,Fontname,Fontsize,PrimaryColour,SecondaryColour,TertiaryColour,BackColour,Bold,Italic,BorderStyle,Outline,Shadow,Alignment,MarginL,MarginR,MarginV,AlphaLevel,Encoding
Style: Default,Arial,18,16777215,16777215,16777215,0,-1,0,1,1,0,2,20,20,20,0,0
[Events]
Format: Marked,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text'''
for current in sub:
start, end, text, line_align, position_align = (
float_or_none(current.get('startTime')),
float_or_none(current.get('endTime')),
current.get('text'), current.get('lineAlign'),
current.get('positionAlign'))
if start is None or end is None or text is None:
continue
alignment = self._POS_ALIGN_MAP.get(position_align, 2) + self._LINE_ALIGN_MAP.get(line_align, 0)
ssa += os.linesep + 'Dialogue: Marked=0,%s,%s,Default,,0,0,0,,%s%s' % (
self._ass_subtitles_timecode(start),
self._ass_subtitles_timecode(end),
'{\\a%d}' % alignment if alignment != 2 else '',
text.replace('\n', '\\N').replace('<i>', '{\\i1}').replace('</i>', '{\\i0}'))
if sub_lang == 'vostf':
sub_lang = 'fr'
subtitles.setdefault(sub_lang, []).extend([{
'ext': 'json',
'data': json.dumps(sub),
}, {
'ext': 'ssa',
'data': ssa,
}])
return subtitles
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
player_config = self._parse_json(self._search_regex(
r'playerConfig\s*=\s*({.+});', webpage,
'player config', default='{}'), video_id, fatal=False)
if not player_config:
config_url = urljoin(self._BASE_URL, self._search_regex(
r'(?:id="player"|class="[^"]*adn-player-container[^"]*")[^>]+data-url="([^"]+)"',
webpage, 'config url'))
player_config = self._download_json(
config_url, video_id,
'Downloading player config JSON metadata')['player']
video_info = {}
video_info_str = self._search_regex(
r'videoInfo\s*=\s*({.+});', webpage,
'video info', fatal=False)
if video_info_str:
video_info = self._parse_json(
video_info_str, video_id, fatal=False) or {}
options = player_config.get('options') or {}
metas = options.get('metas') or {}
links = player_config.get('links') or {}
sub_path = player_config.get('subtitles')
error = None
if not links:
links_url = player_config.get('linksurl') or options['videoUrl']
token = options['token']
self._K = ''.join([random.choice('0123456789abcdef') for _ in range(16)])
message = bytes_to_intlist(json.dumps({
'k': self._K,
'e': 60,
't': token,
}))
padded_message = intlist_to_bytes(pkcs1pad(message, 128))
n, e = self._RSA_KEY
encrypted_message = long_to_bytes(pow(bytes_to_long(padded_message), e, n))
authorization = base64.b64encode(encrypted_message).decode()
links_data = self._download_json(
urljoin(self._BASE_URL, links_url), video_id,
'Downloading links JSON metadata', headers={
'Authorization': 'Bearer ' + authorization,
})
links = links_data.get('links') or {}
metas = metas or links_data.get('meta') or {}
sub_path = sub_path or links_data.get('subtitles') or \
'index.php?option=com_vodapi&task=subtitles.getJSON&format=json&id=' + video_id
sub_path += '&token=' + token
error = links_data.get('error')
title = metas.get('title') or video_info['title']
formats = []
for format_id, qualities in links.items():
if not isinstance(qualities, dict):
continue
for quality, load_balancer_url in qualities.items():
load_balancer_data = self._download_json(
load_balancer_url, video_id,
'Downloading %s %s JSON metadata' % (format_id, quality),
fatal=False) or {}
m3u8_url = load_balancer_data.get('location')
if not m3u8_url:
continue
m3u8_formats = self._extract_m3u8_formats(
m3u8_url, video_id, 'mp4', 'm3u8_native',
m3u8_id=format_id, fatal=False)
if format_id == 'vf':
for f in m3u8_formats:
f['language'] = 'fr'
formats.extend(m3u8_formats)
if not error:
error = options.get('error')
if not formats and error:
raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True)
self._sort_formats(formats)
return {
'id': video_id,
'title': title,
'description': strip_or_none(metas.get('summary') or video_info.get('resume')),
'thumbnail': video_info.get('image'),
'formats': formats,
'subtitles': self.extract_subtitles(sub_path, video_id),
'episode': metas.get('subtitle') or video_info.get('videoTitle'),
'series': video_info.get('playlistTitle'),
}

View File

@@ -0,0 +1,37 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..compat import (
compat_parse_qs,
compat_urlparse,
)
class AdobeConnectIE(InfoExtractor):
_VALID_URL = r'https?://\w+\.adobeconnect\.com/(?P<id>[\w-]+)'
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
title = self._html_search_regex(r'<title>(.+?)</title>', webpage, 'title')
qs = compat_parse_qs(self._search_regex(r"swfUrl\s*=\s*'([^']+)'", webpage, 'swf url').split('?')[1])
is_live = qs.get('isLive', ['false'])[0] == 'true'
formats = []
for con_string in qs['conStrings'][0].split(','):
formats.append({
'format_id': con_string.split('://')[0],
'app': compat_urlparse.quote('?' + con_string.split('?')[1] + 'flvplayerapp/' + qs['appInstance'][0]),
'ext': 'flv',
'play_path': 'mp4:' + qs['streamName'][0],
'rtmp_conn': 'S:' + qs['ticket'][0],
'rtmp_live': is_live,
'url': con_string,
})
return {
'id': video_id,
'title': self._live_title(title) if is_live else title,
'formats': formats,
'is_live': is_live,
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,288 @@
from __future__ import unicode_literals
import functools
import re
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
float_or_none,
int_or_none,
ISO639Utils,
OnDemandPagedList,
parse_duration,
str_or_none,
str_to_int,
unified_strdate,
)
class AdobeTVBaseIE(InfoExtractor):
def _call_api(self, path, video_id, query, note=None):
return self._download_json(
'http://tv.adobe.com/api/v4/' + path,
video_id, note, query=query)['data']
def _parse_subtitles(self, video_data, url_key):
subtitles = {}
for translation in video_data.get('translations', []):
vtt_path = translation.get(url_key)
if not vtt_path:
continue
lang = translation.get('language_w3c') or ISO639Utils.long2short(translation['language_medium'])
subtitles.setdefault(lang, []).append({
'ext': 'vtt',
'url': vtt_path,
})
return subtitles
def _parse_video_data(self, video_data):
video_id = compat_str(video_data['id'])
title = video_data['title']
s3_extracted = False
formats = []
for source in video_data.get('videos', []):
source_url = source.get('url')
if not source_url:
continue
f = {
'format_id': source.get('quality_level'),
'fps': int_or_none(source.get('frame_rate')),
'height': int_or_none(source.get('height')),
'tbr': int_or_none(source.get('video_data_rate')),
'width': int_or_none(source.get('width')),
'url': source_url,
}
original_filename = source.get('original_filename')
if original_filename:
if not (f.get('height') and f.get('width')):
mobj = re.search(r'_(\d+)x(\d+)', original_filename)
if mobj:
f.update({
'height': int(mobj.group(2)),
'width': int(mobj.group(1)),
})
if original_filename.startswith('s3://') and not s3_extracted:
formats.append({
'format_id': 'original',
'preference': 1,
'url': original_filename.replace('s3://', 'https://s3.amazonaws.com/'),
})
s3_extracted = True
formats.append(f)
self._sort_formats(formats)
return {
'id': video_id,
'title': title,
'description': video_data.get('description'),
'thumbnail': video_data.get('thumbnail'),
'upload_date': unified_strdate(video_data.get('start_date')),
'duration': parse_duration(video_data.get('duration')),
'view_count': str_to_int(video_data.get('playcount')),
'formats': formats,
'subtitles': self._parse_subtitles(video_data, 'vtt'),
}
class AdobeTVEmbedIE(AdobeTVBaseIE):
IE_NAME = 'adobetv:embed'
_VALID_URL = r'https?://tv\.adobe\.com/embed/\d+/(?P<id>\d+)'
_TEST = {
'url': 'https://tv.adobe.com/embed/22/4153',
'md5': 'c8c0461bf04d54574fc2b4d07ac6783a',
'info_dict': {
'id': '4153',
'ext': 'flv',
'title': 'Creating Graphics Optimized for BlackBerry',
'description': 'md5:eac6e8dced38bdaae51cd94447927459',
'thumbnail': r're:https?://.*\.jpg$',
'upload_date': '20091109',
'duration': 377,
'view_count': int,
},
}
def _real_extract(self, url):
video_id = self._match_id(url)
video_data = self._call_api(
'episode/' + video_id, video_id, {'disclosure': 'standard'})[0]
return self._parse_video_data(video_data)
class AdobeTVIE(AdobeTVBaseIE):
IE_NAME = 'adobetv'
_VALID_URL = r'https?://tv\.adobe\.com/(?:(?P<language>fr|de|es|jp)/)?watch/(?P<show_urlname>[^/]+)/(?P<id>[^/]+)'
_TEST = {
'url': 'http://tv.adobe.com/watch/the-complete-picture-with-julieanne-kost/quick-tip-how-to-draw-a-circle-around-an-object-in-photoshop/',
'md5': '9bc5727bcdd55251f35ad311ca74fa1e',
'info_dict': {
'id': '10981',
'ext': 'mp4',
'title': 'Quick Tip - How to Draw a Circle Around an Object in Photoshop',
'description': 'md5:99ec318dc909d7ba2a1f2b038f7d2311',
'thumbnail': r're:https?://.*\.jpg$',
'upload_date': '20110914',
'duration': 60,
'view_count': int,
},
}
def _real_extract(self, url):
language, show_urlname, urlname = re.match(self._VALID_URL, url).groups()
if not language:
language = 'en'
video_data = self._call_api(
'episode/get', urlname, {
'disclosure': 'standard',
'language': language,
'show_urlname': show_urlname,
'urlname': urlname,
})[0]
return self._parse_video_data(video_data)
class AdobeTVPlaylistBaseIE(AdobeTVBaseIE):
_PAGE_SIZE = 25
def _fetch_page(self, display_id, query, page):
page += 1
query['page'] = page
for element_data in self._call_api(
self._RESOURCE, display_id, query, 'Download Page %d' % page):
yield self._process_data(element_data)
def _extract_playlist_entries(self, display_id, query):
return OnDemandPagedList(functools.partial(
self._fetch_page, display_id, query), self._PAGE_SIZE)
class AdobeTVShowIE(AdobeTVPlaylistBaseIE):
IE_NAME = 'adobetv:show'
_VALID_URL = r'https?://tv\.adobe\.com/(?:(?P<language>fr|de|es|jp)/)?show/(?P<id>[^/]+)'
_TEST = {
'url': 'http://tv.adobe.com/show/the-complete-picture-with-julieanne-kost',
'info_dict': {
'id': '36',
'title': 'The Complete Picture with Julieanne Kost',
'description': 'md5:fa50867102dcd1aa0ddf2ab039311b27',
},
'playlist_mincount': 136,
}
_RESOURCE = 'episode'
_process_data = AdobeTVBaseIE._parse_video_data
def _real_extract(self, url):
language, show_urlname = re.match(self._VALID_URL, url).groups()
if not language:
language = 'en'
query = {
'disclosure': 'standard',
'language': language,
'show_urlname': show_urlname,
}
show_data = self._call_api(
'show/get', show_urlname, query)[0]
return self.playlist_result(
self._extract_playlist_entries(show_urlname, query),
str_or_none(show_data.get('id')),
show_data.get('show_name'),
show_data.get('show_description'))
class AdobeTVChannelIE(AdobeTVPlaylistBaseIE):
IE_NAME = 'adobetv:channel'
_VALID_URL = r'https?://tv\.adobe\.com/(?:(?P<language>fr|de|es|jp)/)?channel/(?P<id>[^/]+)(?:/(?P<category_urlname>[^/]+))?'
_TEST = {
'url': 'http://tv.adobe.com/channel/development',
'info_dict': {
'id': 'development',
},
'playlist_mincount': 96,
}
_RESOURCE = 'show'
def _process_data(self, show_data):
return self.url_result(
show_data['url'], 'AdobeTVShow', str_or_none(show_data.get('id')))
def _real_extract(self, url):
language, channel_urlname, category_urlname = re.match(self._VALID_URL, url).groups()
if not language:
language = 'en'
query = {
'channel_urlname': channel_urlname,
'language': language,
}
if category_urlname:
query['category_urlname'] = category_urlname
return self.playlist_result(
self._extract_playlist_entries(channel_urlname, query),
channel_urlname)
class AdobeTVVideoIE(AdobeTVBaseIE):
IE_NAME = 'adobetv:video'
_VALID_URL = r'https?://video\.tv\.adobe\.com/v/(?P<id>\d+)'
_TEST = {
# From https://helpx.adobe.com/acrobat/how-to/new-experience-acrobat-dc.html?set=acrobat--get-started--essential-beginners
'url': 'https://video.tv.adobe.com/v/2456/',
'md5': '43662b577c018ad707a63766462b1e87',
'info_dict': {
'id': '2456',
'ext': 'mp4',
'title': 'New experience with Acrobat DC',
'description': 'New experience with Acrobat DC',
'duration': 248.667,
},
}
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
video_data = self._parse_json(self._search_regex(
r'var\s+bridge\s*=\s*([^;]+);', webpage, 'bridged data'), video_id)
title = video_data['title']
formats = []
sources = video_data.get('sources') or []
for source in sources:
source_src = source.get('src')
if not source_src:
continue
formats.append({
'filesize': int_or_none(source.get('kilobytes') or None, invscale=1000),
'format_id': '-'.join(filter(None, [source.get('format'), source.get('label')])),
'height': int_or_none(source.get('height') or None),
'tbr': int_or_none(source.get('bitrate') or None),
'width': int_or_none(source.get('width') or None),
'url': source_src,
})
self._sort_formats(formats)
# For both metadata and downloaded files the duration varies among
# formats. I just pick the max one
duration = max(filter(None, [
float_or_none(source.get('duration'), scale=1000)
for source in sources]))
return {
'id': video_id,
'formats': formats,
'title': title,
'description': video_data.get('description'),
'thumbnail': video_data.get('video', {}).get('poster'),
'duration': duration,
'subtitles': self._parse_subtitles(video_data, 'vttPath'),
}

View File

@@ -0,0 +1,202 @@
# coding: utf-8
from __future__ import unicode_literals
import json
import re
from .turner import TurnerBaseIE
from ..utils import (
determine_ext,
float_or_none,
int_or_none,
mimetype2ext,
parse_age_limit,
parse_iso8601,
strip_or_none,
try_get,
)
class AdultSwimIE(TurnerBaseIE):
_VALID_URL = r'https?://(?:www\.)?adultswim\.com/videos/(?P<show_path>[^/?#]+)(?:/(?P<episode_path>[^/?#]+))?'
_TESTS = [{
'url': 'http://adultswim.com/videos/rick-and-morty/pilot',
'info_dict': {
'id': 'rQxZvXQ4ROaSOqq-or2Mow',
'ext': 'mp4',
'title': 'Rick and Morty - Pilot',
'description': 'Rick moves in with his daughter\'s family and establishes himself as a bad influence on his grandson, Morty.',
'timestamp': 1543294800,
'upload_date': '20181127',
},
'params': {
# m3u8 download
'skip_download': True,
},
'expected_warnings': ['Unable to download f4m manifest'],
}, {
'url': 'http://www.adultswim.com/videos/tim-and-eric-awesome-show-great-job/dr-steve-brule-for-your-wine/',
'info_dict': {
'id': 'sY3cMUR_TbuE4YmdjzbIcQ',
'ext': 'mp4',
'title': 'Tim and Eric Awesome Show Great Job! - Dr. Steve Brule, For Your Wine',
'description': 'Dr. Brule reports live from Wine Country with a special report on wines. \nWatch Tim and Eric Awesome Show Great Job! episode #20, "Embarrassed" on Adult Swim.',
'upload_date': '20080124',
'timestamp': 1201150800,
},
'params': {
# m3u8 download
'skip_download': True,
},
'skip': '404 Not Found',
}, {
'url': 'http://www.adultswim.com/videos/decker/inside-decker-a-new-hero/',
'info_dict': {
'id': 'I0LQFQkaSUaFp8PnAWHhoQ',
'ext': 'mp4',
'title': 'Decker - Inside Decker: A New Hero',
'description': 'The guys recap the conclusion of the season. They announce a new hero, take a peek into the Victorville Film Archive and welcome back the talented James Dean.',
'timestamp': 1469480460,
'upload_date': '20160725',
},
'params': {
# m3u8 download
'skip_download': True,
},
'expected_warnings': ['Unable to download f4m manifest'],
}, {
'url': 'http://www.adultswim.com/videos/attack-on-titan',
'info_dict': {
'id': 'attack-on-titan',
'title': 'Attack on Titan',
'description': 'md5:41caa9416906d90711e31dc00cb7db7e',
},
'playlist_mincount': 12,
}, {
'url': 'http://www.adultswim.com/videos/streams/williams-stream',
'info_dict': {
'id': 'd8DEBj7QRfetLsRgFnGEyg',
'ext': 'mp4',
'title': r're:^Williams Stream \d{4}-\d{2}-\d{2} \d{2}:\d{2}$',
'description': 'original programming',
},
'params': {
# m3u8 download
'skip_download': True,
},
'skip': '404 Not Found',
}]
def _real_extract(self, url):
show_path, episode_path = re.match(self._VALID_URL, url).groups()
display_id = episode_path or show_path
query = '''query {
getShowBySlug(slug:"%s") {
%%s
}
}''' % show_path
if episode_path:
query = query % '''title
getVideoBySlug(slug:"%s") {
_id
auth
description
duration
episodeNumber
launchDate
mediaID
seasonNumber
poster
title
tvRating
}''' % episode_path
['getVideoBySlug']
else:
query = query % '''metaDescription
title
videos(first:1000,sort:["episode_number"]) {
edges {
node {
_id
slug
}
}
}'''
show_data = self._download_json(
'https://www.adultswim.com/api/search', display_id,
data=json.dumps({'query': query}).encode(),
headers={'Content-Type': 'application/json'})['data']['getShowBySlug']
if episode_path:
video_data = show_data['getVideoBySlug']
video_id = video_data['_id']
episode_title = title = video_data['title']
series = show_data.get('title')
if series:
title = '%s - %s' % (series, title)
info = {
'id': video_id,
'title': title,
'description': strip_or_none(video_data.get('description')),
'duration': float_or_none(video_data.get('duration')),
'formats': [],
'subtitles': {},
'age_limit': parse_age_limit(video_data.get('tvRating')),
'thumbnail': video_data.get('poster'),
'timestamp': parse_iso8601(video_data.get('launchDate')),
'series': series,
'season_number': int_or_none(video_data.get('seasonNumber')),
'episode': episode_title,
'episode_number': int_or_none(video_data.get('episodeNumber')),
}
auth = video_data.get('auth')
media_id = video_data.get('mediaID')
if media_id:
info.update(self._extract_ngtv_info(media_id, {
# CDN_TOKEN_APP_ID from:
# https://d2gg02c3xr550i.cloudfront.net/assets/asvp.e9c8bef24322d060ef87.bundle.js
'appId': 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJhcHBJZCI6ImFzLXR2ZS1kZXNrdG9wLXB0enQ2bSIsInByb2R1Y3QiOiJ0dmUiLCJuZXR3b3JrIjoiYXMiLCJwbGF0Zm9ybSI6ImRlc2t0b3AiLCJpYXQiOjE1MzI3MDIyNzl9.BzSCk-WYOZ2GMCIaeVb8zWnzhlgnXuJTCu0jGp_VaZE',
}, {
'url': url,
'site_name': 'AdultSwim',
'auth_required': auth,
}))
if not auth:
extract_data = self._download_json(
'https://www.adultswim.com/api/shows/v1/videos/' + video_id,
video_id, query={'fields': 'stream'}, fatal=False) or {}
assets = try_get(extract_data, lambda x: x['data']['video']['stream']['assets'], list) or []
for asset in assets:
asset_url = asset.get('url')
if not asset_url:
continue
ext = determine_ext(asset_url, mimetype2ext(asset.get('mime_type')))
if ext == 'm3u8':
info['formats'].extend(self._extract_m3u8_formats(
asset_url, video_id, 'mp4', m3u8_id='hls', fatal=False))
elif ext == 'f4m':
continue
# info['formats'].extend(self._extract_f4m_formats(
# asset_url, video_id, f4m_id='hds', fatal=False))
elif ext in ('scc', 'ttml', 'vtt'):
info['subtitles'].setdefault('en', []).append({
'url': asset_url,
})
self._sort_formats(info['formats'])
return info
else:
entries = []
for edge in show_data.get('videos', {}).get('edges', []):
video = edge.get('node') or {}
slug = video.get('slug')
if not slug:
continue
entries.append(self.url_result(
'http://adultswim.com/videos/%s/%s' % (show_path, slug),
'AdultSwim', video.get('_id')))
return self.playlist_result(
entries, show_path, show_data.get('title'),
strip_or_none(show_data.get('metaDescription')))

View File

@@ -0,0 +1,247 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .theplatform import ThePlatformIE
from ..utils import (
extract_attributes,
ExtractorError,
int_or_none,
smuggle_url,
update_url_query,
)
from ..compat import (
compat_urlparse,
)
class AENetworksBaseIE(ThePlatformIE):
_THEPLATFORM_KEY = 'crazyjava'
_THEPLATFORM_SECRET = 's3cr3t'
def _extract_aen_smil(self, smil_url, video_id, auth=None):
query = {'mbr': 'true'}
if auth:
query['auth'] = auth
TP_SMIL_QUERY = [{
'assetTypes': 'high_video_ak',
'switch': 'hls_high_ak'
}, {
'assetTypes': 'high_video_s3'
}, {
'assetTypes': 'high_video_s3',
'switch': 'hls_ingest_fastly'
}]
formats = []
subtitles = {}
last_e = None
for q in TP_SMIL_QUERY:
q.update(query)
m_url = update_url_query(smil_url, q)
m_url = self._sign_url(m_url, self._THEPLATFORM_KEY, self._THEPLATFORM_SECRET)
try:
tp_formats, tp_subtitles = self._extract_theplatform_smil(
m_url, video_id, 'Downloading %s SMIL data' % (q.get('switch') or q['assetTypes']))
except ExtractorError as e:
last_e = e
continue
formats.extend(tp_formats)
subtitles = self._merge_subtitles(subtitles, tp_subtitles)
if last_e and not formats:
raise last_e
self._sort_formats(formats)
return {
'id': video_id,
'formats': formats,
'subtitles': subtitles,
}
class AENetworksIE(AENetworksBaseIE):
IE_NAME = 'aenetworks'
IE_DESC = 'A+E Networks: A&E, Lifetime, History.com, FYI Network and History Vault'
_VALID_URL = r'''(?x)
https?://
(?:www\.)?
(?P<domain>
(?:history(?:vault)?|aetv|mylifetime|lifetimemovieclub)\.com|
fyi\.tv
)/
(?:
shows/(?P<show_path>[^/]+(?:/[^/]+){0,2})|
movies/(?P<movie_display_id>[^/]+)(?:/full-movie)?|
specials/(?P<special_display_id>[^/]+)/(?:full-special|preview-)|
collections/[^/]+/(?P<collection_display_id>[^/]+)
)
'''
_TESTS = [{
'url': 'http://www.history.com/shows/mountain-men/season-1/episode-1',
'info_dict': {
'id': '22253814',
'ext': 'mp4',
'title': 'Winter is Coming',
'description': 'md5:641f424b7a19d8e24f26dea22cf59d74',
'timestamp': 1338306241,
'upload_date': '20120529',
'uploader': 'AENE-NEW',
},
'params': {
# m3u8 download
'skip_download': True,
},
'add_ie': ['ThePlatform'],
}, {
'url': 'http://www.history.com/shows/ancient-aliens/season-1',
'info_dict': {
'id': '71889446852',
},
'playlist_mincount': 5,
}, {
'url': 'http://www.mylifetime.com/shows/atlanta-plastic',
'info_dict': {
'id': 'SERIES4317',
'title': 'Atlanta Plastic',
},
'playlist_mincount': 2,
}, {
'url': 'http://www.aetv.com/shows/duck-dynasty/season-9/episode-1',
'only_matching': True
}, {
'url': 'http://www.fyi.tv/shows/tiny-house-nation/season-1/episode-8',
'only_matching': True
}, {
'url': 'http://www.mylifetime.com/shows/project-runway-junior/season-1/episode-6',
'only_matching': True
}, {
'url': 'http://www.mylifetime.com/movies/center-stage-on-pointe/full-movie',
'only_matching': True
}, {
'url': 'https://www.lifetimemovieclub.com/movies/a-killer-among-us',
'only_matching': True
}, {
'url': 'http://www.history.com/specials/sniper-into-the-kill-zone/full-special',
'only_matching': True
}, {
'url': 'https://www.historyvault.com/collections/america-the-story-of-us/westward',
'only_matching': True
}, {
'url': 'https://www.aetv.com/specials/hunting-jonbenets-killer-the-untold-story/preview-hunting-jonbenets-killer-the-untold-story',
'only_matching': True
}]
_DOMAIN_TO_REQUESTOR_ID = {
'history.com': 'HISTORY',
'aetv.com': 'AETV',
'mylifetime.com': 'LIFETIME',
'lifetimemovieclub.com': 'LIFETIMEMOVIECLUB',
'fyi.tv': 'FYI',
}
def _real_extract(self, url):
domain, show_path, movie_display_id, special_display_id, collection_display_id = re.match(self._VALID_URL, url).groups()
display_id = show_path or movie_display_id or special_display_id or collection_display_id
webpage = self._download_webpage(url, display_id, headers=self.geo_verification_headers())
if show_path:
url_parts = show_path.split('/')
url_parts_len = len(url_parts)
if url_parts_len == 1:
entries = []
for season_url_path in re.findall(r'(?s)<li[^>]+data-href="(/shows/%s/season-\d+)"' % url_parts[0], webpage):
entries.append(self.url_result(
compat_urlparse.urljoin(url, season_url_path), 'AENetworks'))
if entries:
return self.playlist_result(
entries, self._html_search_meta('aetn:SeriesId', webpage),
self._html_search_meta('aetn:SeriesTitle', webpage))
else:
# single season
url_parts_len = 2
if url_parts_len == 2:
entries = []
for episode_item in re.findall(r'(?s)<[^>]+class="[^"]*(?:episode|program)-item[^"]*"[^>]*>', webpage):
episode_attributes = extract_attributes(episode_item)
episode_url = compat_urlparse.urljoin(
url, episode_attributes['data-canonical'])
entries.append(self.url_result(
episode_url, 'AENetworks',
episode_attributes.get('data-videoid') or episode_attributes.get('data-video-id')))
return self.playlist_result(
entries, self._html_search_meta('aetn:SeasonId', webpage))
video_id = self._html_search_meta('aetn:VideoID', webpage)
media_url = self._search_regex(
[r"media_url\s*=\s*'(?P<url>[^']+)'",
r'data-media-url=(?P<url>(?:https?:)?//[^\s>]+)',
r'data-media-url=(["\'])(?P<url>(?:(?!\1).)+?)\1'],
webpage, 'video url', group='url')
theplatform_metadata = self._download_theplatform_metadata(self._search_regex(
r'https?://link\.theplatform\.com/s/([^?]+)', media_url, 'theplatform_path'), video_id)
info = self._parse_theplatform_metadata(theplatform_metadata)
auth = None
if theplatform_metadata.get('AETN$isBehindWall'):
requestor_id = self._DOMAIN_TO_REQUESTOR_ID[domain]
resource = self._get_mvpd_resource(
requestor_id, theplatform_metadata['title'],
theplatform_metadata.get('AETN$PPL_pplProgramId') or theplatform_metadata.get('AETN$PPL_pplProgramId_OLD'),
theplatform_metadata['ratings'][0]['rating'])
auth = self._extract_mvpd_auth(
url, video_id, requestor_id, resource)
info.update(self._search_json_ld(webpage, video_id, fatal=False))
info.update(self._extract_aen_smil(media_url, video_id, auth))
return info
class HistoryTopicIE(AENetworksBaseIE):
IE_NAME = 'history:topic'
IE_DESC = 'History.com Topic'
_VALID_URL = r'https?://(?:www\.)?history\.com/topics/[^/]+/(?P<id>[\w+-]+?)-video'
_TESTS = [{
'url': 'https://www.history.com/topics/valentines-day/history-of-valentines-day-video',
'info_dict': {
'id': '40700995724',
'ext': 'mp4',
'title': "History of Valentines Day",
'description': 'md5:7b57ea4829b391995b405fa60bd7b5f7',
'timestamp': 1375819729,
'upload_date': '20130806',
},
'params': {
# m3u8 download
'skip_download': True,
},
'add_ie': ['ThePlatform'],
}]
def theplatform_url_result(self, theplatform_url, video_id, query):
return {
'_type': 'url_transparent',
'id': video_id,
'url': smuggle_url(
update_url_query(theplatform_url, query),
{
'sig': {
'key': self._THEPLATFORM_KEY,
'secret': self._THEPLATFORM_SECRET,
},
'force_smil_url': True
}),
'ie_key': 'ThePlatform',
}
def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
video_id = self._search_regex(
r'<phoenix-iframe[^>]+src="[^"]+\btpid=(\d+)', webpage, 'tpid')
result = self._download_json(
'https://feeds.video.aetnd.com/api/v2/history/videos',
video_id, query={'filter[id]': video_id})['results'][0]
title = result['title']
info = self._extract_aen_smil(result['publicUrl'], video_id)
info.update({
'title': title,
'description': result.get('description'),
'duration': int_or_none(result.get('duration')),
'timestamp': int_or_none(result.get('added'), 1000),
})
return info

View File

@@ -0,0 +1,367 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..compat import compat_xpath
from ..utils import (
determine_ext,
ExtractorError,
int_or_none,
url_or_none,
urlencode_postdata,
xpath_text,
)
class AfreecaTVIE(InfoExtractor):
IE_NAME = 'afreecatv'
IE_DESC = 'afreecatv.com'
_VALID_URL = r'''(?x)
https?://
(?:
(?:(?:live|afbbs|www)\.)?afreeca(?:tv)?\.com(?::\d+)?
(?:
/app/(?:index|read_ucc_bbs)\.cgi|
/player/[Pp]layer\.(?:swf|html)
)\?.*?\bnTitleNo=|
vod\.afreecatv\.com/PLAYER/STATION/
)
(?P<id>\d+)
'''
_NETRC_MACHINE = 'afreecatv'
_TESTS = [{
'url': 'http://live.afreecatv.com:8079/app/index.cgi?szType=read_ucc_bbs&szBjId=dailyapril&nStationNo=16711924&nBbsNo=18605867&nTitleNo=36164052&szSkin=',
'md5': 'f72c89fe7ecc14c1b5ce506c4996046e',
'info_dict': {
'id': '36164052',
'ext': 'mp4',
'title': '데일리 에이프릴 요정들의 시상식!',
'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$',
'uploader': 'dailyapril',
'uploader_id': 'dailyapril',
'upload_date': '20160503',
},
'skip': 'Video is gone',
}, {
'url': 'http://afbbs.afreecatv.com:8080/app/read_ucc_bbs.cgi?nStationNo=16711924&nTitleNo=36153164&szBjId=dailyapril&nBbsNo=18605867',
'info_dict': {
'id': '36153164',
'title': "BJ유트루와 함께하는 '팅커벨 메이크업!'",
'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$',
'uploader': 'dailyapril',
'uploader_id': 'dailyapril',
},
'playlist_count': 2,
'playlist': [{
'md5': 'd8b7c174568da61d774ef0203159bf97',
'info_dict': {
'id': '36153164_1',
'ext': 'mp4',
'title': "BJ유트루와 함께하는 '팅커벨 메이크업!'",
'upload_date': '20160502',
},
}, {
'md5': '58f2ce7f6044e34439ab2d50612ab02b',
'info_dict': {
'id': '36153164_2',
'ext': 'mp4',
'title': "BJ유트루와 함께하는 '팅커벨 메이크업!'",
'upload_date': '20160502',
},
}],
'skip': 'Video is gone',
}, {
'url': 'http://vod.afreecatv.com/PLAYER/STATION/18650793',
'info_dict': {
'id': '18650793',
'ext': 'mp4',
'title': '오늘은 다르다! 쏘님의 우월한 위아래~ 댄스리액션!',
'thumbnail': r're:^https?://.*\.jpg$',
'uploader': '윈아디',
'uploader_id': 'badkids',
'duration': 107,
},
'params': {
'skip_download': True,
},
}, {
'url': 'http://vod.afreecatv.com/PLAYER/STATION/10481652',
'info_dict': {
'id': '10481652',
'title': "BJ유트루와 함께하는 '팅커벨 메이크업!'",
'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$',
'uploader': 'dailyapril',
'uploader_id': 'dailyapril',
'duration': 6492,
},
'playlist_count': 2,
'playlist': [{
'md5': 'd8b7c174568da61d774ef0203159bf97',
'info_dict': {
'id': '20160502_c4c62b9d_174361386_1',
'ext': 'mp4',
'title': "BJ유트루와 함께하는 '팅커벨 메이크업!' (part 1)",
'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$',
'uploader': 'dailyapril',
'uploader_id': 'dailyapril',
'upload_date': '20160502',
'duration': 3601,
},
}, {
'md5': '58f2ce7f6044e34439ab2d50612ab02b',
'info_dict': {
'id': '20160502_39e739bb_174361386_2',
'ext': 'mp4',
'title': "BJ유트루와 함께하는 '팅커벨 메이크업!' (part 2)",
'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$',
'uploader': 'dailyapril',
'uploader_id': 'dailyapril',
'upload_date': '20160502',
'duration': 2891,
},
}],
'params': {
'skip_download': True,
},
}, {
# non standard key
'url': 'http://vod.afreecatv.com/PLAYER/STATION/20515605',
'info_dict': {
'id': '20170411_BE689A0E_190960999_1_2_h',
'ext': 'mp4',
'title': '혼자사는여자집',
'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$',
'uploader': '♥이슬이',
'uploader_id': 'dasl8121',
'upload_date': '20170411',
'duration': 213,
},
'params': {
'skip_download': True,
},
}, {
# PARTIAL_ADULT
'url': 'http://vod.afreecatv.com/PLAYER/STATION/32028439',
'info_dict': {
'id': '20180327_27901457_202289533_1',
'ext': 'mp4',
'title': '[생]빨개요♥ (part 1)',
'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$',
'uploader': '[SA]서아',
'uploader_id': 'bjdyrksu',
'upload_date': '20180327',
'duration': 3601,
},
'params': {
'skip_download': True,
},
'expected_warnings': ['adult content'],
}, {
'url': 'http://www.afreecatv.com/player/Player.swf?szType=szBjId=djleegoon&nStationNo=11273158&nBbsNo=13161095&nTitleNo=36327652',
'only_matching': True,
}, {
'url': 'http://vod.afreecatv.com/PLAYER/STATION/15055030',
'only_matching': True,
}]
@staticmethod
def parse_video_key(key):
video_key = {}
m = re.match(r'^(?P<upload_date>\d{8})_\w+_(?P<part>\d+)$', key)
if m:
video_key['upload_date'] = m.group('upload_date')
video_key['part'] = int(m.group('part'))
return video_key
def _real_initialize(self):
self._login()
def _login(self):
username, password = self._get_login_info()
if username is None:
return
login_form = {
'szWork': 'login',
'szType': 'json',
'szUid': username,
'szPassword': password,
'isSaveId': 'false',
'szScriptVar': 'oLoginRet',
'szAction': '',
}
response = self._download_json(
'https://login.afreecatv.com/app/LoginAction.php', None,
'Logging in', data=urlencode_postdata(login_form))
_ERRORS = {
-4: 'Your account has been suspended due to a violation of our terms and policies.',
-5: 'https://member.afreecatv.com/app/user_delete_progress.php',
-6: 'https://login.afreecatv.com/membership/changeMember.php',
-8: "Hello! AfreecaTV here.\nThe username you have entered belongs to \n an account that requires a legal guardian's consent. \nIf you wish to use our services without restriction, \nplease make sure to go through the necessary verification process.",
-9: 'https://member.afreecatv.com/app/pop_login_block.php',
-11: 'https://login.afreecatv.com/afreeca/second_login.php',
-12: 'https://member.afreecatv.com/app/user_security.php',
0: 'The username does not exist or you have entered the wrong password.',
-1: 'The username does not exist or you have entered the wrong password.',
-3: 'You have entered your username/password incorrectly.',
-7: 'You cannot use your Global AfreecaTV account to access Korean AfreecaTV.',
-10: 'Sorry for the inconvenience. \nYour account has been blocked due to an unauthorized access. \nPlease contact our Help Center for assistance.',
-32008: 'You have failed to log in. Please contact our Help Center.',
}
result = int_or_none(response.get('RESULT'))
if result != 1:
error = _ERRORS.get(result, 'You have failed to log in.')
raise ExtractorError(
'Unable to login: %s said: %s' % (self.IE_NAME, error),
expected=True)
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
if re.search(r'alert\(["\']This video has been deleted', webpage):
raise ExtractorError(
'Video %s has been deleted' % video_id, expected=True)
station_id = self._search_regex(
r'nStationNo\s*=\s*(\d+)', webpage, 'station')
bbs_id = self._search_regex(
r'nBbsNo\s*=\s*(\d+)', webpage, 'bbs')
video_id = self._search_regex(
r'nTitleNo\s*=\s*(\d+)', webpage, 'title', default=video_id)
partial_view = False
for _ in range(2):
query = {
'nTitleNo': video_id,
'nStationNo': station_id,
'nBbsNo': bbs_id,
}
if partial_view:
query['partialView'] = 'SKIP_ADULT'
video_xml = self._download_xml(
'http://afbbs.afreecatv.com:8080/api/video/get_video_info.php',
video_id, 'Downloading video info XML%s'
% (' (skipping adult)' if partial_view else ''),
video_id, headers={
'Referer': url,
}, query=query)
flag = xpath_text(video_xml, './track/flag', 'flag', default=None)
if flag and flag == 'SUCCEED':
break
if flag == 'PARTIAL_ADULT':
self._downloader.report_warning(
'In accordance with local laws and regulations, underage users are restricted from watching adult content. '
'Only content suitable for all ages will be downloaded. '
'Provide account credentials if you wish to download restricted content.')
partial_view = True
continue
elif flag == 'ADULT':
error = 'Only users older than 19 are able to watch this video. Provide account credentials to download this content.'
else:
error = flag
raise ExtractorError(
'%s said: %s' % (self.IE_NAME, error), expected=True)
else:
raise ExtractorError('Unable to download video info')
video_element = video_xml.findall(compat_xpath('./track/video'))[-1]
if video_element is None or video_element.text is None:
raise ExtractorError(
'Video %s video does not exist' % video_id, expected=True)
video_url = video_element.text.strip()
title = xpath_text(video_xml, './track/title', 'title', fatal=True)
uploader = xpath_text(video_xml, './track/nickname', 'uploader')
uploader_id = xpath_text(video_xml, './track/bj_id', 'uploader id')
duration = int_or_none(xpath_text(
video_xml, './track/duration', 'duration'))
thumbnail = xpath_text(video_xml, './track/titleImage', 'thumbnail')
common_entry = {
'uploader': uploader,
'uploader_id': uploader_id,
'thumbnail': thumbnail,
}
info = common_entry.copy()
info.update({
'id': video_id,
'title': title,
'duration': duration,
})
if not video_url:
entries = []
file_elements = video_element.findall(compat_xpath('./file'))
one = len(file_elements) == 1
for file_num, file_element in enumerate(file_elements, start=1):
file_url = url_or_none(file_element.text)
if not file_url:
continue
key = file_element.get('key', '')
upload_date = self._search_regex(
r'^(\d{8})_', key, 'upload date', default=None)
file_duration = int_or_none(file_element.get('duration'))
format_id = key if key else '%s_%s' % (video_id, file_num)
if determine_ext(file_url) == 'm3u8':
formats = self._extract_m3u8_formats(
file_url, video_id, 'mp4', entry_protocol='m3u8_native',
m3u8_id='hls',
note='Downloading part %d m3u8 information' % file_num)
else:
formats = [{
'url': file_url,
'format_id': 'http',
}]
if not formats:
continue
self._sort_formats(formats)
file_info = common_entry.copy()
file_info.update({
'id': format_id,
'title': title if one else '%s (part %d)' % (title, file_num),
'upload_date': upload_date,
'duration': file_duration,
'formats': formats,
})
entries.append(file_info)
entries_info = info.copy()
entries_info.update({
'_type': 'multi_video',
'entries': entries,
})
return entries_info
info = {
'id': video_id,
'title': title,
'uploader': uploader,
'uploader_id': uploader_id,
'duration': duration,
'thumbnail': thumbnail,
}
if determine_ext(video_url) == 'm3u8':
info['formats'] = self._extract_m3u8_formats(
video_url, video_id, 'mp4', entry_protocol='m3u8_native',
m3u8_id='hls')
else:
app, playpath = video_url.split('mp4:')
info.update({
'url': app,
'ext': 'flv',
'play_path': 'mp4:' + playpath,
'rtmp_live': True, # downloading won't end without this
})
return info

View File

@@ -0,0 +1,66 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
int_or_none,
parse_duration,
parse_iso8601,
)
class AirMozillaIE(InfoExtractor):
_VALID_URL = r'https?://air\.mozilla\.org/(?P<id>[0-9a-z-]+)/?'
_TEST = {
'url': 'https://air.mozilla.org/privacy-lab-a-meetup-for-privacy-minded-people-in-san-francisco/',
'md5': '8d02f53ee39cf006009180e21df1f3ba',
'info_dict': {
'id': '6x4q2w',
'ext': 'mp4',
'title': 'Privacy Lab - a meetup for privacy minded people in San Francisco',
'thumbnail': r're:https?://.*/poster\.jpg',
'description': 'Brings together privacy professionals and others interested in privacy at for-profits, non-profits, and NGOs in an effort to contribute to the state of the ecosystem...',
'timestamp': 1422487800,
'upload_date': '20150128',
'location': 'SFO Commons',
'duration': 3780,
'view_count': int,
'categories': ['Main', 'Privacy'],
}
}
def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
video_id = self._html_search_regex(r'//vid\.ly/(.*?)/embed', webpage, 'id')
embed_script = self._download_webpage('https://vid.ly/{0}/embed'.format(video_id), video_id)
jwconfig = self._parse_json(self._search_regex(
r'initCallback\((.*)\);', embed_script, 'metadata'), video_id)['config']
info_dict = self._parse_jwplayer_data(jwconfig, video_id)
view_count = int_or_none(self._html_search_regex(
r'Views since archived: ([0-9]+)',
webpage, 'view count', fatal=False))
timestamp = parse_iso8601(self._html_search_regex(
r'<time datetime="(.*?)"', webpage, 'timestamp', fatal=False))
duration = parse_duration(self._search_regex(
r'Duration:\s*(\d+\s*hours?\s*\d+\s*minutes?)',
webpage, 'duration', fatal=False))
info_dict.update({
'id': video_id,
'title': self._og_search_title(webpage),
'url': self._og_search_url(webpage),
'display_id': display_id,
'description': self._og_search_description(webpage),
'timestamp': timestamp,
'location': self._html_search_regex(r'Location: (.*)', webpage, 'location', default=None),
'duration': duration,
'view_count': view_count,
'categories': re.findall(r'<a href=".*?" class="channel">(.*?)</a>', webpage),
})
return info_dict

View File

@@ -0,0 +1,53 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
float_or_none,
try_get,
)
class AliExpressLiveIE(InfoExtractor):
_VALID_URL = r'https?://live\.aliexpress\.com/live/(?P<id>\d+)'
_TEST = {
'url': 'https://live.aliexpress.com/live/2800002704436634',
'md5': 'e729e25d47c5e557f2630eaf99b740a5',
'info_dict': {
'id': '2800002704436634',
'ext': 'mp4',
'title': 'CASIMA7.22',
'thumbnail': r're:http://.*\.jpg',
'uploader': 'CASIMA Official Store',
'timestamp': 1500717600,
'upload_date': '20170722',
},
}
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
data = self._parse_json(
self._search_regex(
r'(?s)runParams\s*=\s*({.+?})\s*;?\s*var',
webpage, 'runParams'),
video_id)
title = data['title']
formats = self._extract_m3u8_formats(
data['replyStreamUrl'], video_id, 'mp4',
entry_protocol='m3u8_native', m3u8_id='hls')
return {
'id': video_id,
'title': title,
'thumbnail': data.get('coverUrl'),
'uploader': try_get(
data, lambda x: x['followBar']['name'], compat_str),
'timestamp': float_or_none(data.get('startTimeLong'), scale=1000),
'formats': formats,
}

View File

@@ -0,0 +1,33 @@
from __future__ import unicode_literals
from .common import InfoExtractor
class AlJazeeraIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?aljazeera\.com/(?:programmes|video)/.*?/(?P<id>[^/]+)\.html'
_TESTS = [{
'url': 'http://www.aljazeera.com/programmes/the-slum/2014/08/deliverance-201482883754237240.html',
'info_dict': {
'id': '3792260579001',
'ext': 'mp4',
'title': 'The Slum - Episode 1: Deliverance',
'description': 'As a birth attendant advocating for family planning, Remy is on the frontline of Tondo\'s battle with overcrowding.',
'uploader_id': '665003303001',
'timestamp': 1411116829,
'upload_date': '20140919',
},
'add_ie': ['BrightcoveNew'],
'skip': 'Not accessible from Travis CI server',
}, {
'url': 'http://www.aljazeera.com/video/news/2017/05/sierra-leone-709-carat-diamond-auctioned-170511100111930.html',
'only_matching': True,
}]
BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/665003303001/default_default/index.html?videoId=%s'
def _real_extract(self, url):
program_name = self._match_id(url)
webpage = self._download_webpage(url, program_name)
brightcove_id = self._search_regex(
r'RenderPagesVideo\(\'(.+?)\'', webpage, 'brightcove id')
return self.url_result(self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 'BrightcoveNew', brightcove_id)

View File

@@ -0,0 +1,132 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
int_or_none,
qualities,
remove_end,
try_get,
unified_timestamp,
url_basename,
)
class AllocineIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?allocine\.fr/(?:article|video|film)/(?:fichearticle_gen_carticle=|player_gen_cmedia=|fichefilm_gen_cfilm=|video-)(?P<id>[0-9]+)(?:\.html)?'
_TESTS = [{
'url': 'http://www.allocine.fr/article/fichearticle_gen_carticle=18635087.html',
'md5': '0c9fcf59a841f65635fa300ac43d8269',
'info_dict': {
'id': '19546517',
'display_id': '18635087',
'ext': 'mp4',
'title': 'Astérix - Le Domaine des Dieux Teaser VF',
'description': 'md5:4a754271d9c6f16c72629a8a993ee884',
'thumbnail': r're:http://.*\.jpg',
'duration': 39,
'timestamp': 1404273600,
'upload_date': '20140702',
'view_count': int,
},
}, {
'url': 'http://www.allocine.fr/video/player_gen_cmedia=19540403&cfilm=222257.html',
'md5': 'd0cdce5d2b9522ce279fdfec07ff16e0',
'info_dict': {
'id': '19540403',
'display_id': '19540403',
'ext': 'mp4',
'title': 'Planes 2 Bande-annonce VF',
'description': 'Regardez la bande annonce du film Planes 2 (Planes 2 Bande-annonce VF). Planes 2, un film de Roberts Gannaway',
'thumbnail': r're:http://.*\.jpg',
'duration': 69,
'timestamp': 1385659800,
'upload_date': '20131128',
'view_count': int,
},
}, {
'url': 'http://www.allocine.fr/video/player_gen_cmedia=19544709&cfilm=181290.html',
'md5': '101250fb127ef9ca3d73186ff22a47ce',
'info_dict': {
'id': '19544709',
'display_id': '19544709',
'ext': 'mp4',
'title': 'Dragons 2 - Bande annonce finale VF',
'description': 'md5:6cdd2d7c2687d4c6aafe80a35e17267a',
'thumbnail': r're:http://.*\.jpg',
'duration': 144,
'timestamp': 1397589900,
'upload_date': '20140415',
'view_count': int,
},
}, {
'url': 'http://www.allocine.fr/video/video-19550147/',
'md5': '3566c0668c0235e2d224fd8edb389f67',
'info_dict': {
'id': '19550147',
'ext': 'mp4',
'title': 'Faux Raccord N°123 - Les gaffes de Cliffhanger',
'description': 'md5:bc734b83ffa2d8a12188d9eb48bb6354',
'thumbnail': r're:http://.*\.jpg',
},
}]
def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
formats = []
quality = qualities(['ld', 'md', 'hd'])
model = self._html_search_regex(
r'data-model="([^"]+)"', webpage, 'data model', default=None)
if model:
model_data = self._parse_json(model, display_id)
video = model_data['videos'][0]
title = video['title']
for video_url in video['sources'].values():
video_id, format_id = url_basename(video_url).split('_')[:2]
formats.append({
'format_id': format_id,
'quality': quality(format_id),
'url': video_url,
})
duration = int_or_none(video.get('duration'))
view_count = int_or_none(video.get('view_count'))
timestamp = unified_timestamp(try_get(
video, lambda x: x['added_at']['date'], compat_str))
else:
video_id = display_id
media_data = self._download_json(
'http://www.allocine.fr/ws/AcVisiondataV5.ashx?media=%s' % video_id, display_id)
title = remove_end(
self._html_search_regex(
r'(?s)<title>(.+?)</title>', webpage, 'title').strip(),
' - AlloCiné')
for key, value in media_data['video'].items():
if not key.endswith('Path'):
continue
format_id = key[:-len('Path')]
formats.append({
'format_id': format_id,
'quality': quality(format_id),
'url': value,
})
duration, view_count, timestamp = [None] * 3
self._sort_formats(formats)
return {
'id': video_id,
'display_id': display_id,
'title': title,
'description': self._og_search_description(webpage),
'thumbnail': self._og_search_thumbnail(webpage),
'duration': duration,
'timestamp': timestamp,
'view_count': view_count,
'formats': formats,
}

View File

@@ -0,0 +1,77 @@
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
parse_iso8601,
parse_duration,
parse_filesize,
int_or_none,
)
class AlphaPornoIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?alphaporno\.com/videos/(?P<id>[^/]+)'
_TEST = {
'url': 'http://www.alphaporno.com/videos/sensual-striptease-porn-with-samantha-alexandra/',
'md5': 'feb6d3bba8848cd54467a87ad34bd38e',
'info_dict': {
'id': '258807',
'display_id': 'sensual-striptease-porn-with-samantha-alexandra',
'ext': 'mp4',
'title': 'Sensual striptease porn with Samantha Alexandra',
'thumbnail': r're:https?://.*\.jpg$',
'timestamp': 1418694611,
'upload_date': '20141216',
'duration': 387,
'filesize_approx': 54120000,
'tbr': 1145,
'categories': list,
'age_limit': 18,
}
}
def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
video_id = self._search_regex(
r"video_id\s*:\s*'([^']+)'", webpage, 'video id', default=None)
video_url = self._search_regex(
r"video_url\s*:\s*'([^']+)'", webpage, 'video url')
ext = self._html_search_meta(
'encodingFormat', webpage, 'ext', default='.mp4')[1:]
title = self._search_regex(
[r'<meta content="([^"]+)" itemprop="description">',
r'class="title" itemprop="name">([^<]+)<'],
webpage, 'title')
thumbnail = self._html_search_meta('thumbnail', webpage, 'thumbnail')
timestamp = parse_iso8601(self._html_search_meta(
'uploadDate', webpage, 'upload date'))
duration = parse_duration(self._html_search_meta(
'duration', webpage, 'duration'))
filesize_approx = parse_filesize(self._html_search_meta(
'contentSize', webpage, 'file size'))
bitrate = int_or_none(self._html_search_meta(
'bitrate', webpage, 'bitrate'))
categories = self._html_search_meta(
'keywords', webpage, 'categories', default='').split(',')
age_limit = self._rta_search(webpage)
return {
'id': video_id,
'display_id': display_id,
'url': video_url,
'ext': ext,
'title': title,
'thumbnail': thumbnail,
'timestamp': timestamp,
'duration': duration,
'filesize_approx': filesize_approx,
'tbr': bitrate,
'categories': categories,
'age_limit': age_limit,
}

View File

@@ -0,0 +1,118 @@
# coding: utf-8
from __future__ import unicode_literals
from .theplatform import ThePlatformIE
from ..utils import (
int_or_none,
parse_age_limit,
try_get,
update_url_query,
)
class AMCNetworksIE(ThePlatformIE):
_VALID_URL = r'https?://(?:www\.)?(?:amc|bbcamerica|ifc|(?:we|sundance)tv)\.com/(?:movies|shows(?:/[^/]+)+)/(?P<id>[^/?#]+)'
_TESTS = [{
'url': 'http://www.ifc.com/shows/maron/season-04/episode-01/step-1',
'md5': '',
'info_dict': {
'id': 's3MX01Nl4vPH',
'ext': 'mp4',
'title': 'Maron - Season 4 - Step 1',
'description': 'In denial about his current situation, Marc is reluctantly convinced by his friends to enter rehab. Starring Marc Maron and Constance Zimmer.',
'age_limit': 17,
'upload_date': '20160505',
'timestamp': 1462468831,
'uploader': 'AMCN',
},
'params': {
# m3u8 download
'skip_download': True,
},
'skip': 'Requires TV provider accounts',
}, {
'url': 'http://www.bbcamerica.com/shows/the-hunt/full-episodes/season-1/episode-01-the-hardest-challenge',
'only_matching': True,
}, {
'url': 'http://www.amc.com/shows/preacher/full-episodes/season-01/episode-00/pilot',
'only_matching': True,
}, {
'url': 'http://www.wetv.com/shows/million-dollar-matchmaker/season-01/episode-06-the-dumped-dj-and-shallow-hal',
'only_matching': True,
}, {
'url': 'http://www.ifc.com/movies/chaos',
'only_matching': True,
}, {
'url': 'http://www.bbcamerica.com/shows/doctor-who/full-episodes/the-power-of-the-daleks/episode-01-episode-1-color-version',
'only_matching': True,
}, {
'url': 'http://www.wetv.com/shows/mama-june-from-not-to-hot/full-episode/season-01/thin-tervention',
'only_matching': True,
}, {
'url': 'http://www.wetv.com/shows/la-hair/videos/season-05/episode-09-episode-9-2/episode-9-sneak-peek-3',
'only_matching': True,
}, {
'url': 'https://www.sundancetv.com/shows/riviera/full-episodes/season-1/episode-01-episode-1',
'only_matching': True,
}]
def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
query = {
'mbr': 'true',
'manifest': 'm3u',
}
media_url = self._search_regex(
r'window\.platformLinkURL\s*=\s*[\'"]([^\'"]+)',
webpage, 'media url')
theplatform_metadata = self._download_theplatform_metadata(self._search_regex(
r'link\.theplatform\.com/s/([^?]+)',
media_url, 'theplatform_path'), display_id)
info = self._parse_theplatform_metadata(theplatform_metadata)
video_id = theplatform_metadata['pid']
title = theplatform_metadata['title']
rating = try_get(
theplatform_metadata, lambda x: x['ratings'][0]['rating'])
auth_required = self._search_regex(
r'window\.authRequired\s*=\s*(true|false);',
webpage, 'auth required')
if auth_required == 'true':
requestor_id = self._search_regex(
r'window\.requestor_id\s*=\s*[\'"]([^\'"]+)',
webpage, 'requestor id')
resource = self._get_mvpd_resource(
requestor_id, title, video_id, rating)
query['auth'] = self._extract_mvpd_auth(
url, video_id, requestor_id, resource)
media_url = update_url_query(media_url, query)
formats, subtitles = self._extract_theplatform_smil(
media_url, video_id)
self._sort_formats(formats)
info.update({
'id': video_id,
'subtitles': subtitles,
'formats': formats,
'age_limit': parse_age_limit(parse_age_limit(rating)),
})
ns_keys = theplatform_metadata.get('$xmlns', {}).keys()
if ns_keys:
ns = list(ns_keys)[0]
series = theplatform_metadata.get(ns + '$show')
season_number = int_or_none(
theplatform_metadata.get(ns + '$season'))
episode = theplatform_metadata.get(ns + '$episodeTitle')
episode_number = int_or_none(
theplatform_metadata.get(ns + '$episode'))
if season_number:
title = 'Season %d - %s' % (season_number, title)
if series:
title = '%s - %s' % (series, title)
info.update({
'title': title,
'series': series,
'season_number': season_number,
'episode': episode,
'episode_number': episode_number,
})
return info

View File

@@ -0,0 +1,82 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
clean_html,
int_or_none,
js_to_json,
try_get,
unified_strdate,
)
class AmericasTestKitchenIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?americastestkitchen\.com/(?:episode|videos)/(?P<id>\d+)'
_TESTS = [{
'url': 'https://www.americastestkitchen.com/episode/582-weeknight-japanese-suppers',
'md5': 'b861c3e365ac38ad319cfd509c30577f',
'info_dict': {
'id': '5b400b9ee338f922cb06450c',
'title': 'Weeknight Japanese Suppers',
'ext': 'mp4',
'description': 'md5:3d0c1a44bb3b27607ce82652db25b4a8',
'thumbnail': r're:^https?://',
'timestamp': 1523664000,
'upload_date': '20180414',
'release_date': '20180414',
'series': "America's Test Kitchen",
'season_number': 18,
'episode': 'Weeknight Japanese Suppers',
'episode_number': 15,
},
'params': {
'skip_download': True,
},
}, {
'url': 'https://www.americastestkitchen.com/videos/3420-pan-seared-salmon',
'only_matching': True,
}]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
video_data = self._parse_json(
self._search_regex(
r'window\.__INITIAL_STATE__\s*=\s*({.+?})\s*;\s*</script>',
webpage, 'initial context'),
video_id, js_to_json)
ep_data = try_get(
video_data,
(lambda x: x['episodeDetail']['content']['data'],
lambda x: x['videoDetail']['content']['data']), dict)
ep_meta = ep_data.get('full_video', {})
zype_id = ep_data.get('zype_id') or ep_meta['zype_id']
title = ep_data.get('title') or ep_meta.get('title')
description = clean_html(ep_meta.get('episode_description') or ep_data.get(
'description') or ep_meta.get('description'))
thumbnail = try_get(ep_meta, lambda x: x['photo']['image_url'])
release_date = unified_strdate(ep_data.get('aired_at'))
season_number = int_or_none(ep_meta.get('season_number'))
episode = ep_meta.get('title')
episode_number = int_or_none(ep_meta.get('episode_number'))
return {
'_type': 'url_transparent',
'url': 'https://player.zype.com/embed/%s.js?api_key=jZ9GUhRmxcPvX7M3SlfejB6Hle9jyHTdk2jVxG7wOHPLODgncEKVdPYBhuz9iWXQ' % zype_id,
'ie_key': 'Zype',
'title': title,
'description': description,
'thumbnail': thumbnail,
'release_date': release_date,
'series': "America's Test Kitchen",
'season_number': season_number,
'episode': episode,
'episode_number': episode_number,
}

View File

@@ -0,0 +1,102 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
determine_ext,
ExtractorError,
int_or_none,
mimetype2ext,
parse_iso8601,
url_or_none,
)
class AMPIE(InfoExtractor):
# parse Akamai Adaptive Media Player feed
def _extract_feed_info(self, url):
feed = self._download_json(
url, None, 'Downloading Akamai AMP feed',
'Unable to download Akamai AMP feed')
item = feed.get('channel', {}).get('item')
if not item:
raise ExtractorError('%s said: %s' % (self.IE_NAME, feed['error']))
video_id = item['guid']
def get_media_node(name, default=None):
media_name = 'media-%s' % name
media_group = item.get('media-group') or item
return media_group.get(media_name) or item.get(media_name) or item.get(name, default)
thumbnails = []
media_thumbnail = get_media_node('thumbnail')
if media_thumbnail:
if isinstance(media_thumbnail, dict):
media_thumbnail = [media_thumbnail]
for thumbnail_data in media_thumbnail:
thumbnail = thumbnail_data.get('@attributes', {})
thumbnail_url = url_or_none(thumbnail.get('url'))
if not thumbnail_url:
continue
thumbnails.append({
'url': self._proto_relative_url(thumbnail_url, 'http:'),
'width': int_or_none(thumbnail.get('width')),
'height': int_or_none(thumbnail.get('height')),
})
subtitles = {}
media_subtitle = get_media_node('subTitle')
if media_subtitle:
if isinstance(media_subtitle, dict):
media_subtitle = [media_subtitle]
for subtitle_data in media_subtitle:
subtitle = subtitle_data.get('@attributes', {})
subtitle_href = url_or_none(subtitle.get('href'))
if not subtitle_href:
continue
subtitles.setdefault(subtitle.get('lang') or 'en', []).append({
'url': subtitle_href,
'ext': mimetype2ext(subtitle.get('type')) or determine_ext(subtitle_href),
})
formats = []
media_content = get_media_node('content')
if isinstance(media_content, dict):
media_content = [media_content]
for media_data in media_content:
media = media_data.get('@attributes', {})
media_url = url_or_none(media.get('url'))
if not media_url:
continue
ext = mimetype2ext(media.get('type')) or determine_ext(media_url)
if ext == 'f4m':
formats.extend(self._extract_f4m_formats(
media_url + '?hdcore=3.4.0&plugin=aasp-3.4.0.132.124',
video_id, f4m_id='hds', fatal=False))
elif ext == 'm3u8':
formats.extend(self._extract_m3u8_formats(
media_url, video_id, 'mp4', m3u8_id='hls', fatal=False))
else:
formats.append({
'format_id': media_data.get('media-category', {}).get('@attributes', {}).get('label'),
'url': media_url,
'tbr': int_or_none(media.get('bitrate')),
'filesize': int_or_none(media.get('fileSize')),
'ext': ext,
})
self._sort_formats(formats)
timestamp = parse_iso8601(item.get('pubDate'), ' ') or parse_iso8601(item.get('dc-date'))
return {
'id': video_id,
'title': get_media_node('title'),
'description': get_media_node('description'),
'thumbnails': thumbnails,
'timestamp': timestamp,
'duration': int_or_none(media_content[0].get('@attributes', {}).get('duration')),
'subtitles': subtitles,
'formats': formats,
}

View File

@@ -0,0 +1,293 @@
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
determine_ext,
extract_attributes,
ExtractorError,
url_or_none,
urlencode_postdata,
urljoin,
)
class AnimeOnDemandIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?anime-on-demand\.de/anime/(?P<id>\d+)'
_LOGIN_URL = 'https://www.anime-on-demand.de/users/sign_in'
_APPLY_HTML5_URL = 'https://www.anime-on-demand.de/html5apply'
_NETRC_MACHINE = 'animeondemand'
# German-speaking countries of Europe
_GEO_COUNTRIES = ['AT', 'CH', 'DE', 'LI', 'LU']
_TESTS = [{
# jap, OmU
'url': 'https://www.anime-on-demand.de/anime/161',
'info_dict': {
'id': '161',
'title': 'Grimgar, Ashes and Illusions (OmU)',
'description': 'md5:6681ce3c07c7189d255ac6ab23812d31',
},
'playlist_mincount': 4,
}, {
# Film wording is used instead of Episode, ger/jap, Dub/OmU
'url': 'https://www.anime-on-demand.de/anime/39',
'only_matching': True,
}, {
# Episodes without titles, jap, OmU
'url': 'https://www.anime-on-demand.de/anime/162',
'only_matching': True,
}, {
# ger/jap, Dub/OmU, account required
'url': 'https://www.anime-on-demand.de/anime/169',
'only_matching': True,
}, {
# Full length film, non-series, ger/jap, Dub/OmU, account required
'url': 'https://www.anime-on-demand.de/anime/185',
'only_matching': True,
}, {
# Flash videos
'url': 'https://www.anime-on-demand.de/anime/12',
'only_matching': True,
}]
def _login(self):
username, password = self._get_login_info()
if username is None:
return
login_page = self._download_webpage(
self._LOGIN_URL, None, 'Downloading login page')
if '>Our licensing terms allow the distribution of animes only to German-speaking countries of Europe' in login_page:
self.raise_geo_restricted(
'%s is only available in German-speaking countries of Europe' % self.IE_NAME)
login_form = self._form_hidden_inputs('new_user', login_page)
login_form.update({
'user[login]': username,
'user[password]': password,
})
post_url = self._search_regex(
r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page,
'post url', default=self._LOGIN_URL, group='url')
if not post_url.startswith('http'):
post_url = urljoin(self._LOGIN_URL, post_url)
response = self._download_webpage(
post_url, None, 'Logging in',
data=urlencode_postdata(login_form), headers={
'Referer': self._LOGIN_URL,
})
if all(p not in response for p in ('>Logout<', 'href="/users/sign_out"')):
error = self._search_regex(
r'<p[^>]+\bclass=(["\'])(?:(?!\1).)*\balert\b(?:(?!\1).)*\1[^>]*>(?P<error>.+?)</p>',
response, 'error', default=None, group='error')
if error:
raise ExtractorError('Unable to login: %s' % error, expected=True)
raise ExtractorError('Unable to log in')
def _real_initialize(self):
self._login()
def _real_extract(self, url):
anime_id = self._match_id(url)
webpage = self._download_webpage(url, anime_id)
if 'data-playlist=' not in webpage:
self._download_webpage(
self._APPLY_HTML5_URL, anime_id,
'Activating HTML5 beta', 'Unable to apply HTML5 beta')
webpage = self._download_webpage(url, anime_id)
csrf_token = self._html_search_meta(
'csrf-token', webpage, 'csrf token', fatal=True)
anime_title = self._html_search_regex(
r'(?s)<h1[^>]+itemprop="name"[^>]*>(.+?)</h1>',
webpage, 'anime name')
anime_description = self._html_search_regex(
r'(?s)<div[^>]+itemprop="description"[^>]*>(.+?)</div>',
webpage, 'anime description', default=None)
entries = []
def extract_info(html, video_id, num=None):
title, description = [None] * 2
formats = []
for input_ in re.findall(
r'<input[^>]+class=["\'].*?streamstarter[^>]+>', html):
attributes = extract_attributes(input_)
title = attributes.get('data-dialog-header')
playlist_urls = []
for playlist_key in ('data-playlist', 'data-otherplaylist', 'data-stream'):
playlist_url = attributes.get(playlist_key)
if isinstance(playlist_url, compat_str) and re.match(
r'/?[\da-zA-Z]+', playlist_url):
playlist_urls.append(attributes[playlist_key])
if not playlist_urls:
continue
lang = attributes.get('data-lang')
lang_note = attributes.get('value')
for playlist_url in playlist_urls:
kind = self._search_regex(
r'videomaterialurl/\d+/([^/]+)/',
playlist_url, 'media kind', default=None)
format_id_list = []
if lang:
format_id_list.append(lang)
if kind:
format_id_list.append(kind)
if not format_id_list and num is not None:
format_id_list.append(compat_str(num))
format_id = '-'.join(format_id_list)
format_note = ', '.join(filter(None, (kind, lang_note)))
item_id_list = []
if format_id:
item_id_list.append(format_id)
item_id_list.append('videomaterial')
playlist = self._download_json(
urljoin(url, playlist_url), video_id,
'Downloading %s JSON' % ' '.join(item_id_list),
headers={
'X-Requested-With': 'XMLHttpRequest',
'X-CSRF-Token': csrf_token,
'Referer': url,
'Accept': 'application/json, text/javascript, */*; q=0.01',
}, fatal=False)
if not playlist:
continue
stream_url = url_or_none(playlist.get('streamurl'))
if stream_url:
rtmp = re.search(
r'^(?P<url>rtmpe?://(?P<host>[^/]+)/(?P<app>.+/))(?P<playpath>mp[34]:.+)',
stream_url)
if rtmp:
formats.append({
'url': rtmp.group('url'),
'app': rtmp.group('app'),
'play_path': rtmp.group('playpath'),
'page_url': url,
'player_url': 'https://www.anime-on-demand.de/assets/jwplayer.flash-55abfb34080700304d49125ce9ffb4a6.swf',
'rtmp_real_time': True,
'format_id': 'rtmp',
'ext': 'flv',
})
continue
start_video = playlist.get('startvideo', 0)
playlist = playlist.get('playlist')
if not playlist or not isinstance(playlist, list):
continue
playlist = playlist[start_video]
title = playlist.get('title')
if not title:
continue
description = playlist.get('description')
for source in playlist.get('sources', []):
file_ = source.get('file')
if not file_:
continue
ext = determine_ext(file_)
format_id_list = [lang, kind]
if ext == 'm3u8':
format_id_list.append('hls')
elif source.get('type') == 'video/dash' or ext == 'mpd':
format_id_list.append('dash')
format_id = '-'.join(filter(None, format_id_list))
if ext == 'm3u8':
file_formats = self._extract_m3u8_formats(
file_, video_id, 'mp4',
entry_protocol='m3u8_native', m3u8_id=format_id, fatal=False)
elif source.get('type') == 'video/dash' or ext == 'mpd':
continue
file_formats = self._extract_mpd_formats(
file_, video_id, mpd_id=format_id, fatal=False)
else:
continue
for f in file_formats:
f.update({
'language': lang,
'format_note': format_note,
})
formats.extend(file_formats)
return {
'title': title,
'description': description,
'formats': formats,
}
def extract_entries(html, video_id, common_info, num=None):
info = extract_info(html, video_id, num)
if info['formats']:
self._sort_formats(info['formats'])
f = common_info.copy()
f.update(info)
entries.append(f)
# Extract teaser/trailer only when full episode is not available
if not info['formats']:
m = re.search(
r'data-dialog-header=(["\'])(?P<title>.+?)\1[^>]+href=(["\'])(?P<href>.+?)\3[^>]*>(?P<kind>Teaser|Trailer)<',
html)
if m:
f = common_info.copy()
f.update({
'id': '%s-%s' % (f['id'], m.group('kind').lower()),
'title': m.group('title'),
'url': urljoin(url, m.group('href')),
})
entries.append(f)
def extract_episodes(html):
for num, episode_html in enumerate(re.findall(
r'(?s)<h3[^>]+class="episodebox-title".+?>Episodeninhalt<', html), 1):
episodebox_title = self._search_regex(
(r'class="episodebox-title"[^>]+title=(["\'])(?P<title>.+?)\1',
r'class="episodebox-title"[^>]+>(?P<title>.+?)<'),
episode_html, 'episodebox title', default=None, group='title')
if not episodebox_title:
continue
episode_number = int(self._search_regex(
r'(?:Episode|Film)\s*(\d+)',
episodebox_title, 'episode number', default=num))
episode_title = self._search_regex(
r'(?:Episode|Film)\s*\d+\s*-\s*(.+)',
episodebox_title, 'episode title', default=None)
video_id = 'episode-%d' % episode_number
common_info = {
'id': video_id,
'series': anime_title,
'episode': episode_title,
'episode_number': episode_number,
}
extract_entries(episode_html, video_id, common_info)
def extract_film(html, video_id):
common_info = {
'id': anime_id,
'title': anime_title,
'description': anime_description,
}
extract_entries(html, video_id, common_info)
extract_episodes(webpage)
if not entries:
extract_film(webpage, anime_id)
return self.playlist_result(entries, anime_id, anime_title, anime_description)

View File

@@ -0,0 +1,314 @@
# coding: utf-8
from __future__ import unicode_literals
import base64
import hashlib
import json
import random
import re
import time
from .common import InfoExtractor
from ..aes import aes_encrypt
from ..compat import compat_str
from ..utils import (
bytes_to_intlist,
determine_ext,
intlist_to_bytes,
int_or_none,
strip_jsonp,
unescapeHTML,
unsmuggle_url,
)
def md5_text(s):
if not isinstance(s, compat_str):
s = compat_str(s)
return hashlib.md5(s.encode('utf-8')).hexdigest()
class AnvatoIE(InfoExtractor):
_VALID_URL = r'anvato:(?P<access_key_or_mcp>[^:]+):(?P<id>\d+)'
# Copied from anvplayer.min.js
_ANVACK_TABLE = {
'nbcu_nbcd_desktop_web_prod_93d8ead38ce2024f8f544b78306fbd15895ae5e6': 'NNemUkySjxLyPTKvZRiGntBIjEyK8uqicjMakIaQ',
'nbcu_nbcd_desktop_web_qa_1a6f01bdd0dc45a439043b694c8a031d': 'eSxJUbA2UUKBTXryyQ2d6NuM8oEqaPySvaPzfKNA',
'nbcu_nbcd_desktop_web_acc_eb2ff240a5d4ae9a63d4c297c32716b6c523a129': '89JR3RtUGbvKuuJIiKOMK0SoarLb5MUx8v89RcbP',
'nbcu_nbcd_watchvod_web_prod_e61107507180976724ec8e8319fe24ba5b4b60e1': 'Uc7dFt7MJ9GsBWB5T7iPvLaMSOt8BBxv4hAXk5vv',
'nbcu_nbcd_watchvod_web_qa_42afedba88a36203db5a4c09a5ba29d045302232': 'T12oDYVFP2IaFvxkmYMy5dKxswpLHtGZa4ZAXEi7',
'nbcu_nbcd_watchvod_web_acc_9193214448e2e636b0ffb78abacfd9c4f937c6ca': 'MmobcxUxMedUpohNWwXaOnMjlbiyTOBLL6d46ZpR',
'nbcu_local_monitor_web_acc_f998ad54eaf26acd8ee033eb36f39a7b791c6335': 'QvfIoPYrwsjUCcASiw3AIkVtQob2LtJHfidp9iWg',
'nbcu_cable_monitor_web_acc_a413759603e8bedfcd3c61b14767796e17834077': 'uwVPJLShvJWSs6sWEIuVem7MTF8A4IknMMzIlFto',
'nbcu_nbcd_mcpstage_web_qa_4c43a8f6e95a88dbb40276c0630ba9f693a63a4e': 'PxVYZVwjhgd5TeoPRxL3whssb5OUPnM3zyAzq8GY',
'nbcu_comcast_comcast_web_prod_074080762ad4ce956b26b43fb22abf153443a8c4': 'afnaRZfDyg1Z3WZHdupKfy6xrbAG2MHqe3VfuSwh',
'nbcu_comcast_comcast_web_qa_706103bb93ead3ef70b1de12a0e95e3c4481ade0': 'DcjsVbX9b3uoPlhdriIiovgFQZVxpISZwz0cx1ZK',
'nbcu_comcast_comcastcable_web_prod_669f04817536743563d7331c9293e59fbdbe3d07': '0RwMN2cWy10qhAhOscq3eK7aEe0wqnKt3vJ0WS4D',
'nbcu_comcast_comcastcable_web_qa_3d9d2d66219094127f0f6b09cc3c7bb076e3e1ca': '2r8G9DEya7PCqBceKZgrn2XkXgASjwLMuaFE1Aad',
'hearst_hearst_demo_web_stage_960726dfef3337059a01a78816e43b29ec04dfc7': 'cuZBPXTR6kSdoTCVXwk5KGA8rk3NrgGn4H6e9Dsp',
'anvato_mcpqa_demo_web_stage_18b55e00db5a13faa8d03ae6e41f6f5bcb15b922': 'IOaaLQ8ymqVyem14QuAvE5SndQynTcH5CrLkU2Ih',
'anvato_nextmedia_demo_web_stage_9787d56a02ff6b9f43e9a2b0920d8ca88beb5818': 'Pqu9zVzI1ApiIzbVA3VkGBEQHvdKSUuKpD6s2uaR',
'anvato_scripps_app_web_prod_0837996dbe373629133857ae9eb72e740424d80a': 'du1ccmn7RxzgizwbWU7hyUaGodNlJn7HtXI0WgXW',
'anvato_scripps_app_web_stage_360797e00fe2826be142155c4618cc52fce6c26c': '2PMrQ0BRoqCWl7nzphj0GouIMEh2mZYivAT0S1Su',
'fs2go_fs2go_go_all_prod_21934911ccfafc03a075894ead2260d11e2ddd24': 'RcuHlKikW2IJw6HvVoEkqq2UsuEJlbEl11pWXs4Q',
'fs2go_fs2go_go_web_prod_ead4b0eec7460c1a07783808db21b49cf1f2f9a7': '4K0HTT2u1zkQA2MaGaZmkLa1BthGSBdr7jllrhk5',
'fs2go_fs2go_go_web_stage_407585454a4400355d4391691c67f361': 'ftnc37VKRJBmHfoGGi3kT05bHyeJzilEzhKJCyl3',
'fs2go_fs2go_go_android_stage_44b714db6f8477f29afcba15a41e1d30': 'CtxpPvVpo6AbZGomYUhkKs7juHZwNml9b9J0J2gI',
'anvato_cbslocal_app_web_prod_547f3e49241ef0e5d30c79b2efbca5d92c698f67': 'Pw0XX5KBDsyRnPS0R2JrSrXftsy8Jnz5pAjaYC8s',
'anvato_cbslocal_app_web_stage_547a5f096594cd3e00620c6f825cad1096d28c80': '37OBUhX2uwNyKhhrNzSSNHSRPZpApC3trdqDBpuz',
'fs2go_att_att_web_prod_1042dddd089a05438b6a08f972941176f699ffd8': 'JLcF20JwYvpv6uAGcLWIaV12jKwaL1R8us4b6Zkg',
'fs2go_att_att_web_stage_807c5001955fc114a3331fe027ddc76e': 'gbu1oO1y0JiOFh4SUipt86P288JHpyjSqolrrT1x',
'fs2go_fs2go_tudor_web_prod_a7dd8e5a7cdc830cae55eae6f3e9fee5ee49eb9b': 'ipcp87VCEZXPPe868j3orLqzc03oTy7DXsGkAXXH',
'anvato_mhz_app_web_prod_b808218b30de7fdf60340cbd9831512bc1bf6d37': 'Stlm5Gs6BEhJLRTZHcNquyzxGqr23EuFmE5DCgjX',
'fs2go_charter_charter_web_stage_c2c6e5a68375a1bf00fff213d3ff8f61a835a54c': 'Lz4hbJp1fwL6jlcz4M2PMzghM4jp4aAmybtT5dPc',
'fs2go_charter_charter_web_prod_ebfe3b10f1af215a7321cd3d629e0b81dfa6fa8c': 'vUJsK345A1bVmyYDRhZX0lqFIgVXuqhmuyp1EtPK',
'anvato_epfox_app_web_prod_b3373168e12f423f41504f207000188daf88251b': 'GDKq1ixvX3MoBNdU5IOYmYa2DTUXYOozPjrCJnW7',
'anvato_epfox_app_web_stage_a3c2ce60f8f83ef374a88b68ee73a950f8ab87ce': '2jz2NH4BsXMaDsoJ5qkHMbcczAfIReo2eFYuVC1C',
'fs2go_verizon_verizon_web_stage_08e6df0354a4803f1b1f2428b5a9a382e8dbcd62': 'rKTVapNaAcmnUbGL4ZcuOoY4SE7VmZSQsblPFr7e',
'fs2go_verizon_verizon_web_prod_f909564cb606eff1f731b5e22e0928676732c445': 'qLSUuHerM3u9eNPzaHyUK52obai5MvE4XDJfqYe1',
'fs2go_foxcom_synd_web_stage_f7b9091f00ea25a4fdaaae77fca5b54cdc7e7043': '96VKF2vLd24fFiDfwPFpzM5llFN4TiIGAlodE0Re',
'fs2go_foxcom_synd_web_prod_0f2cdd64d87e4ab6a1d54aada0ff7a7c8387a064': 'agiPjbXEyEZUkbuhcnmVPhe9NNVbDjCFq2xkcx51',
'anvato_own_app_web_stage_1214ade5d28422c4dae9d03c1243aba0563c4dba': 'mzhamNac3swG4WsJAiUTacnGIODi6SWeVWk5D7ho',
'anvato_own_app_web_prod_944e162ed927ec3e9ed13eb68ed2f1008ee7565e': '9TSxh6G2TXOLBoYm9ro3LdNjjvnXpKb8UR8KoIP9',
'anvato_scripps_app_ftv_prod_a10a10468edd5afb16fb48171c03b956176afad1': 'COJ2i2UIPK7xZqIWswxe7FaVBOVgRkP1F6O6qGoH',
'anvato_scripps_app_ftv_stage_77d3ad2bdb021ec37ca2e35eb09acd396a974c9a': 'Q7nnopNLe2PPfGLOTYBqxSaRpl209IhqaEuDZi1F',
'anvato_univision_app_web_stage_551236ef07a0e17718c3995c35586b5ed8cb5031': 'D92PoLS6UitwxDRA191HUGT9OYcOjV6mPMa5wNyo',
'anvato_univision_app_web_prod_039a5c0a6009e637ae8ac906718a79911e0e65e1': '5mVS5u4SQjtw6NGw2uhMbKEIONIiLqRKck5RwQLR',
'nbcu_cnbc_springfield_ios_prod_670207fae43d6e9a94c351688851a2ce': 'M7fqCCIP9lW53oJbHs19OlJlpDrVyc2OL8gNeuTa',
'nbcu_cnbc_springfieldvod_ios_prod_7a5f04b1ceceb0e9c9e2264a44aa236e08e034c2': 'Yia6QbJahW0S7K1I0drksimhZb4UFq92xLBmmMvk',
'anvato_cox_app_web_prod_ce45cda237969f93e7130f50ee8bb6280c1484ab': 'cc0miZexpFtdoqZGvdhfXsLy7FXjRAOgb9V0f5fZ',
'anvato_cox_app_web_stage_c23dbe016a8e9d8c7101d10172b92434f6088bf9': 'yivU3MYHd2eDZcOfmLbINVtqxyecKTOp8OjOuoGJ',
'anvato_chnzero_app_web_stage_b1164d1352b579e792e542fddf13ee34c0eeb46b': 'A76QkXMmVH8lTCfU15xva1mZnSVcqeY4Xb22Kp7m',
'anvato_chnzero_app_web_prod_253d358928dc08ec161eda2389d53707288a730c': 'OA5QI3ZWZZkdtUEDqh28AH8GedsF6FqzJI32596b',
'anvato_discovery_vodpoc_web_stage_9fa7077b5e8af1f8355f65d4fb8d2e0e9d54e2b7': 'q3oT191tTQ5g3JCP67PkjLASI9s16DuWZ6fYmry3',
'anvato_discovery_vodpoc_web_prod_688614983167a1af6cdf6d76343fda10a65223c1': 'qRvRQCTVHd0VVOHsMvvfidyWmlYVrTbjby7WqIuK',
'nbcu_cnbc_springfieldvod_ftv_stage_826040aad1925a46ac5dfb4b3c5143e648c6a30d': 'JQaSb5a8Tz0PT4ti329DNmzDO30TnngTHmvX8Vua',
'nbcu_cnbc_springfield_ftv_stage_826040aad1925a46ac5dfb4b3c5143e648c6a30d': 'JQaSb5a8Tz0PT4ti329DNmzDO30TnngTHmvX8Vua',
'nbcu_nbcd_capture_web_stage_4dd9d585bfb984ebf856dee35db027b2465cc4ae': '0j1Ov4Vopyi2HpBZJYdL2m8ERJVGYh3nNpzPiO8F',
'nbcu_nbcd_watch3_android_prod_7712ca5fcf1c22f19ec1870a9650f9c37db22dcf': '3LN2UB3rPUAMu7ZriWkHky9vpLMXYha8JbSnxBlx',
'nbcu_nbcd_watchvod3_android_prod_0910a3a4692d57c0b5ff4316075bc5d096be45b9': 'mJagcQ2II30vUOAauOXne7ERwbf5S9nlB3IP17lQ',
'anvato_scripps_app_atv_prod_790deda22e16e71e83df58f880cd389908a45d52': 'CB6trI1mpoDIM5o54DNTsji90NDBQPZ4z4RqBNSH',
'nbcu_nbcd_watchv4_android_prod_ff67cef9cb409158c6f8c3533edddadd0b750507': 'j8CHQCUWjlYERj4NFRmUYOND85QNbHViH09UwuKm',
'nbcu_nbcd_watchvodv4_android_prod_a814d781609989dea6a629d50ae4c7ad8cc8e907': 'rkVnUXxdA9rawVLUlDQtMue9Y4Q7lFEaIotcUhjt',
'rvVKpA50qlOPLFxMjrCGf5pdkdQDm7qn': '1J7ZkY5Qz5lMLi93QOH9IveE7EYB3rLl',
'nbcu_dtv_local_web_prod_b266cf49defe255fd4426a97e27c09e513e9f82f': 'HuLnJDqzLa4saCzYMJ79zDRSQpEduw1TzjMNQu2b',
'nbcu_att_local_web_prod_4cef038b2d969a6b7d700a56a599040b6a619f67': 'Q0Em5VDc2KpydUrVwzWRXAwoNBulWUxCq2faK0AV',
'nbcu_dish_local_web_prod_c56dcaf2da2e9157a4266c82a78195f1dd570f6b': 'bC1LWmRz9ayj2AlzizeJ1HuhTfIaJGsDBnZNgoRg',
'nbcu_verizon_local_web_prod_88bebd2ce006d4ed980de8133496f9a74cb9b3e1': 'wzhDKJZpgvUSS1EQvpCQP8Q59qVzcPixqDGJefSk',
'nbcu_charter_local_web_prod_9ad90f7fc4023643bb718f0fe0fd5beea2382a50': 'PyNbxNhEWLzy1ZvWEQelRuIQY88Eub7xbSVRMdfT',
'nbcu_suddenlink_local_web_prod_20fb711725cac224baa1c1cb0b1c324d25e97178': '0Rph41lPXZbb3fqeXtHjjbxfSrNbtZp1Ygq7Jypa',
'nbcu_wow_local_web_prod_652d9ce4f552d9c2e7b5b1ed37b8cb48155174ad': 'qayIBZ70w1dItm2zS42AptXnxW15mkjRrwnBjMPv',
'nbcu_centurylink_local_web_prod_2034402b029bf3e837ad46814d9e4b1d1345ccd5': 'StePcPMkjsX51PcizLdLRMzxMEl5k2FlsMLUNV4k',
'nbcu_atlanticbrd_local_web_prod_8d5f5ecbf7f7b2f5e6d908dd75d90ae3565f682e': 'NtYLb4TFUS0pRs3XTkyO5sbVGYjVf17bVbjaGscI',
'nbcu_nbcd_watchvod_web_dev_08bc05699be47c4f31d5080263a8cfadc16d0f7c': 'hwxi2dgDoSWgfmVVXOYZm14uuvku4QfopstXckhr',
'anvato_nextmedia_app_web_prod_a4fa8c7204aa65e71044b57aaf63711980cfe5a0': 'tQN1oGPYY1nM85rJYePWGcIb92TG0gSqoVpQTWOw',
'anvato_mcp_lin_web_prod_4c36fbfd4d8d8ecae6488656e21ac6d1ac972749': 'GUXNf5ZDX2jFUpu4WT2Go4DJ5nhUCzpnwDRRUx1K',
'anvato_mcp_univision_web_prod_37fe34850c99a3b5cdb71dab10a417dd5cdecafa': 'bLDYF8JqfG42b7bwKEgQiU9E2LTIAtnKzSgYpFUH',
'anvato_mcp_fs2go_web_prod_c7b90a93e171469cdca00a931211a2f556370d0a': 'icgGoYGipQMMSEvhplZX1pwbN69srwKYWksz3xWK',
'anvato_mcp_sps_web_prod_54bdc90dd6ba21710e9f7074338365bba28da336': 'fA2iQdI7RDpynqzQYIpXALVS83NTPr8LLFK4LFsu',
'anvato_mcp_anv_web_prod_791407490f4c1ef2a4bcb21103e0cb1bcb3352b3': 'rMOUZqe9lwcGq2mNgG3EDusm6lKgsUnczoOX3mbg',
'anvato_mcp_gray_web_prod_4c10f067c393ed8fc453d3930f8ab2b159973900': 'rMOUZqe9lwcGq2mNgG3EDusm6lKgsUnczoOX3mbg',
'anvato_mcp_hearst_web_prod_5356c3de0fc7c90a3727b4863ca7fec3a4524a99': 'P3uXJ0fXXditBPCGkfvlnVScpPEfKmc64Zv7ZgbK',
'anvato_mcp_cbs_web_prod_02f26581ff80e5bda7aad28226a8d369037f2cbe': 'mGPvo5ZA5SgjOFAPEPXv7AnOpFUICX8hvFQVz69n',
'anvato_mcp_telemundo_web_prod_c5278d51ad46fda4b6ca3d0ea44a7846a054f582': 'qyT6PXXLjVNCrHaRVj0ugAhalNRS7Ee9BP7LUokD',
'nbcu_nbcd_watchvodv4_web_stage_4108362fba2d4ede21f262fea3c4162cbafd66c7': 'DhaU5lj0W2gEdcSSsnxURq8t7KIWtJfD966crVDk',
'anvato_scripps_app_ios_prod_409c41960c60b308db43c3cc1da79cab9f1c3d93': 'WPxj5GraLTkYCyj3M7RozLqIycjrXOEcDGFMIJPn',
'EZqvRyKBJLrgpClDPDF8I7Xpdp40Vx73': '4OxGd2dEakylntVKjKF0UK9PDPYB6A9W',
'M2v78QkpleXm9hPp9jUXI63x5vA6BogR': 'ka6K32k7ZALmpINkjJUGUo0OE42Md1BQ',
'nbcu_nbcd_desktop_web_prod_93d8ead38ce2024f8f544b78306fbd15895ae5e6_secure': 'NNemUkySjxLyPTKvZRiGntBIjEyK8uqicjMakIaQ'
}
_MCP_TO_ACCESS_KEY_TABLE = {
'qa': 'anvato_mcpqa_demo_web_stage_18b55e00db5a13faa8d03ae6e41f6f5bcb15b922',
'lin': 'anvato_mcp_lin_web_prod_4c36fbfd4d8d8ecae6488656e21ac6d1ac972749',
'univison': 'anvato_mcp_univision_web_prod_37fe34850c99a3b5cdb71dab10a417dd5cdecafa',
'uni': 'anvato_mcp_univision_web_prod_37fe34850c99a3b5cdb71dab10a417dd5cdecafa',
'dev': 'anvato_mcp_fs2go_web_prod_c7b90a93e171469cdca00a931211a2f556370d0a',
'sps': 'anvato_mcp_sps_web_prod_54bdc90dd6ba21710e9f7074338365bba28da336',
'spsstg': 'anvato_mcp_sps_web_prod_54bdc90dd6ba21710e9f7074338365bba28da336',
'anv': 'anvato_mcp_anv_web_prod_791407490f4c1ef2a4bcb21103e0cb1bcb3352b3',
'gray': 'anvato_mcp_gray_web_prod_4c10f067c393ed8fc453d3930f8ab2b159973900',
'hearst': 'anvato_mcp_hearst_web_prod_5356c3de0fc7c90a3727b4863ca7fec3a4524a99',
'cbs': 'anvato_mcp_cbs_web_prod_02f26581ff80e5bda7aad28226a8d369037f2cbe',
'telemundo': 'anvato_mcp_telemundo_web_prod_c5278d51ad46fda4b6ca3d0ea44a7846a054f582'
}
_API_KEY = '3hwbSuqqT690uxjNYBktSQpa5ZrpYYR0Iofx7NcJHyA'
_ANVP_RE = r'<script[^>]+\bdata-anvp\s*=\s*(["\'])(?P<anvp>(?:(?!\1).)+)\1'
_AUTH_KEY = b'\x31\xc2\x42\x84\x9e\x73\xa0\xce'
_TESTS = [{
# from https://www.boston25news.com/news/watch-humpback-whale-breaches-right-next-to-fishing-boat-near-nh/817484874
'url': 'anvato:8v9BEynrwx8EFLYpgfOWcG1qJqyXKlRM:4465496',
'info_dict': {
'id': '4465496',
'ext': 'mp4',
'title': 'VIDEO: Humpback whale breaches right next to NH boat',
'description': 'VIDEO: Humpback whale breaches right next to NH boat. Footage courtesy: Zach Fahey.',
'duration': 22,
'timestamp': 1534855680,
'upload_date': '20180821',
'uploader': 'ANV',
},
'params': {
'skip_download': True,
},
}, {
# from https://sanfrancisco.cbslocal.com/2016/06/17/source-oakland-cop-on-leave-for-having-girlfriend-help-with-police-reports/
'url': 'anvato:DVzl9QRzox3ZZsP9bNu5Li3X7obQOnqP:3417601',
'only_matching': True,
}]
def __init__(self, *args, **kwargs):
super(AnvatoIE, self).__init__(*args, **kwargs)
self.__server_time = None
def _server_time(self, access_key, video_id):
if self.__server_time is not None:
return self.__server_time
self.__server_time = int(self._download_json(
self._api_prefix(access_key) + 'server_time?anvack=' + access_key, video_id,
note='Fetching server time')['server_time'])
return self.__server_time
def _api_prefix(self, access_key):
return 'https://tkx2-%s.anvato.net/rest/v2/' % ('prod' if 'prod' in access_key else 'stage')
def _get_video_json(self, access_key, video_id):
# See et() in anvplayer.min.js, which is an alias of getVideoJSON()
video_data_url = self._api_prefix(access_key) + 'mcp/video/%s?anvack=%s' % (video_id, access_key)
server_time = self._server_time(access_key, video_id)
input_data = '%d~%s~%s' % (server_time, md5_text(video_data_url), md5_text(server_time))
auth_secret = intlist_to_bytes(aes_encrypt(
bytes_to_intlist(input_data[:64]), bytes_to_intlist(self._AUTH_KEY)))
video_data_url += '&X-Anvato-Adst-Auth=' + base64.b64encode(auth_secret).decode('ascii')
anvrid = md5_text(time.time() * 1000 * random.random())[:30]
payload = {
'api': {
'anvrid': anvrid,
'anvstk': md5_text('%s|%s|%d|%s' % (
access_key, anvrid, server_time,
self._ANVACK_TABLE.get(access_key, self._API_KEY))),
'anvts': server_time,
},
}
return self._download_json(
video_data_url, video_id, transform_source=strip_jsonp,
data=json.dumps(payload).encode('utf-8'))
def _get_anvato_videos(self, access_key, video_id):
video_data = self._get_video_json(access_key, video_id)
formats = []
for published_url in video_data['published_urls']:
video_url = published_url['embed_url']
media_format = published_url.get('format')
ext = determine_ext(video_url)
if ext == 'smil' or media_format == 'smil':
formats.extend(self._extract_smil_formats(video_url, video_id))
continue
tbr = int_or_none(published_url.get('kbps'))
a_format = {
'url': video_url,
'format_id': ('-'.join(filter(None, ['http', published_url.get('cdn_name')]))).lower(),
'tbr': tbr if tbr != 0 else None,
}
if media_format == 'm3u8' and tbr is not None:
a_format.update({
'format_id': '-'.join(filter(None, ['hls', compat_str(tbr)])),
'ext': 'mp4',
})
elif media_format == 'm3u8-variant' or ext == 'm3u8':
formats.extend(self._extract_m3u8_formats(
video_url, video_id, 'mp4', entry_protocol='m3u8_native',
m3u8_id='hls', fatal=False))
continue
elif ext == 'mp3' or media_format == 'mp3':
a_format['vcodec'] = 'none'
else:
a_format.update({
'width': int_or_none(published_url.get('width')),
'height': int_or_none(published_url.get('height')),
})
formats.append(a_format)
self._sort_formats(formats)
subtitles = {}
for caption in video_data.get('captions', []):
a_caption = {
'url': caption['url'],
'ext': 'tt' if caption.get('format') == 'SMPTE-TT' else None
}
subtitles.setdefault(caption['language'], []).append(a_caption)
return {
'id': video_id,
'formats': formats,
'title': video_data.get('def_title'),
'description': video_data.get('def_description'),
'tags': video_data.get('def_tags', '').split(','),
'categories': video_data.get('categories'),
'thumbnail': video_data.get('thumbnail'),
'timestamp': int_or_none(video_data.get(
'ts_published') or video_data.get('ts_added')),
'uploader': video_data.get('mcp_id'),
'duration': int_or_none(video_data.get('duration')),
'subtitles': subtitles,
}
@staticmethod
def _extract_urls(ie, webpage, video_id):
entries = []
for mobj in re.finditer(AnvatoIE._ANVP_RE, webpage):
anvplayer_data = ie._parse_json(
mobj.group('anvp'), video_id, transform_source=unescapeHTML,
fatal=False)
if not anvplayer_data:
continue
video = anvplayer_data.get('video')
if not isinstance(video, compat_str) or not video.isdigit():
continue
access_key = anvplayer_data.get('accessKey')
if not access_key:
mcp = anvplayer_data.get('mcp')
if mcp:
access_key = AnvatoIE._MCP_TO_ACCESS_KEY_TABLE.get(
mcp.lower())
if not access_key:
continue
entries.append(ie.url_result(
'anvato:%s:%s' % (access_key, video), ie=AnvatoIE.ie_key(),
video_id=video))
return entries
def _extract_anvato_videos(self, webpage, video_id):
anvplayer_data = self._parse_json(
self._html_search_regex(
self._ANVP_RE, webpage, 'Anvato player data', group='anvp'),
video_id)
return self._get_anvato_videos(
anvplayer_data['accessKey'], anvplayer_data['video'])
def _real_extract(self, url):
url, smuggled_data = unsmuggle_url(url, {})
self._initialize_geo_bypass({
'countries': smuggled_data.get('geo_countries'),
})
mobj = re.match(self._VALID_URL, url)
access_key, video_id = mobj.group('access_key_or_mcp', 'id')
if access_key not in self._ANVACK_TABLE:
access_key = self._MCP_TO_ACCESS_KEY_TABLE.get(
access_key) or access_key
return self._get_anvato_videos(access_key, video_id)

View File

@@ -0,0 +1,133 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..compat import (
compat_parse_qs,
compat_urllib_parse_urlparse,
)
from ..utils import (
ExtractorError,
int_or_none,
url_or_none,
)
class AolIE(InfoExtractor):
IE_NAME = 'aol.com'
_VALID_URL = r'(?:aol-video:|https?://(?:www\.)?aol\.(?:com|ca|co\.uk|de|jp)/video/(?:[^/]+/)*)(?P<id>[0-9a-f]+)'
_TESTS = [{
# video with 5min ID
'url': 'https://www.aol.com/video/view/u-s--official-warns-of-largest-ever-irs-phone-scam/518167793/',
'md5': '18ef68f48740e86ae94b98da815eec42',
'info_dict': {
'id': '518167793',
'ext': 'mp4',
'title': 'U.S. Official Warns Of \'Largest Ever\' IRS Phone Scam',
'description': 'A major phone scam has cost thousands of taxpayers more than $1 million, with less than a month until income tax returns are due to the IRS.',
'timestamp': 1395405060,
'upload_date': '20140321',
'uploader': 'Newsy Studio',
},
'params': {
# m3u8 download
'skip_download': True,
}
}, {
# video with vidible ID
'url': 'https://www.aol.com/video/view/netflix-is-raising-rates/5707d6b8e4b090497b04f706/',
'info_dict': {
'id': '5707d6b8e4b090497b04f706',
'ext': 'mp4',
'title': 'Netflix is Raising Rates',
'description': 'Netflix is rewarding millions of its long-standing members with an increase in cost. Veuers Carly Figueroa has more.',
'upload_date': '20160408',
'timestamp': 1460123280,
'uploader': 'Veuer',
},
'params': {
# m3u8 download
'skip_download': True,
}
}, {
'url': 'https://www.aol.com/video/view/park-bench-season-2-trailer/559a1b9be4b0c3bfad3357a7/',
'only_matching': True,
}, {
'url': 'https://www.aol.com/video/view/donald-trump-spokeswoman-tones-down-megyn-kelly-attacks/519442220/',
'only_matching': True,
}, {
'url': 'aol-video:5707d6b8e4b090497b04f706',
'only_matching': True,
}, {
'url': 'https://www.aol.com/video/playlist/PL8245/5ca79d19d21f1a04035db606/',
'only_matching': True,
}, {
'url': 'https://www.aol.ca/video/view/u-s-woman-s-family-arrested-for-murder-first-pinned-on-panhandler-police/5c7ccf45bc03931fa04b2fe1/',
'only_matching': True,
}, {
'url': 'https://www.aol.co.uk/video/view/-one-dead-and-22-hurt-in-bus-crash-/5cb3a6f3d21f1a072b457347/',
'only_matching': True,
}, {
'url': 'https://www.aol.de/video/view/eva-braun-privataufnahmen-von-hitlers-geliebter-werden-digitalisiert/5cb2d49de98ab54c113d3d5d/',
'only_matching': True,
}, {
'url': 'https://www.aol.jp/video/playlist/5a28e936a1334d000137da0c/5a28f3151e642219fde19831/',
'only_matching': True,
}]
def _real_extract(self, url):
video_id = self._match_id(url)
response = self._download_json(
'https://feedapi.b2c.on.aol.com/v1.0/app/videos/aolon/%s/details' % video_id,
video_id)['response']
if response['statusText'] != 'Ok':
raise ExtractorError('%s said: %s' % (self.IE_NAME, response['statusText']), expected=True)
video_data = response['data']
formats = []
m3u8_url = url_or_none(video_data.get('videoMasterPlaylist'))
if m3u8_url:
formats.extend(self._extract_m3u8_formats(
m3u8_url, video_id, 'mp4', m3u8_id='hls', fatal=False))
for rendition in video_data.get('renditions', []):
video_url = url_or_none(rendition.get('url'))
if not video_url:
continue
ext = rendition.get('format')
if ext == 'm3u8':
formats.extend(self._extract_m3u8_formats(
video_url, video_id, 'mp4', m3u8_id='hls', fatal=False))
else:
f = {
'url': video_url,
'format_id': rendition.get('quality'),
}
mobj = re.search(r'(\d+)x(\d+)', video_url)
if mobj:
f.update({
'width': int(mobj.group(1)),
'height': int(mobj.group(2)),
})
else:
qs = compat_parse_qs(compat_urllib_parse_urlparse(video_url).query)
f.update({
'width': int_or_none(qs.get('w', [None])[0]),
'height': int_or_none(qs.get('h', [None])[0]),
})
formats.append(f)
self._sort_formats(formats, ('width', 'height', 'tbr', 'format_id'))
return {
'id': video_id,
'title': video_data['title'],
'duration': int_or_none(video_data.get('duration')),
'timestamp': int_or_none(video_data.get('publishDate')),
'view_count': int_or_none(video_data.get('views')),
'description': video_data.get('description'),
'uploader': video_data.get('videoOwner'),
'formats': formats,
}

View File

@@ -0,0 +1,94 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
determine_ext,
js_to_json,
url_or_none,
)
class APAIE(InfoExtractor):
_VALID_URL = r'https?://[^/]+\.apa\.at/embed/(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'
_TESTS = [{
'url': 'http://uvp.apa.at/embed/293f6d17-692a-44e3-9fd5-7b178f3a1029',
'md5': '2b12292faeb0a7d930c778c7a5b4759b',
'info_dict': {
'id': 'jjv85FdZ',
'ext': 'mp4',
'title': '"Blau ist mysteriös": Die Blue Man Group im Interview',
'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
'thumbnail': r're:^https?://.*\.jpg$',
'duration': 254,
'timestamp': 1519211149,
'upload_date': '20180221',
},
}, {
'url': 'https://uvp-apapublisher.sf.apa.at/embed/2f94e9e6-d945-4db2-9548-f9a41ebf7b78',
'only_matching': True,
}, {
'url': 'http://uvp-rma.sf.apa.at/embed/70404cca-2f47-4855-bbb8-20b1fae58f76',
'only_matching': True,
}, {
'url': 'http://uvp-kleinezeitung.sf.apa.at/embed/f1c44979-dba2-4ebf-b021-e4cf2cac3c81',
'only_matching': True,
}]
@staticmethod
def _extract_urls(webpage):
return [
mobj.group('url')
for mobj in re.finditer(
r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//[^/]+\.apa\.at/embed/[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}.*?)\1',
webpage)]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
jwplatform_id = self._search_regex(
r'media[iI]d\s*:\s*["\'](?P<id>[a-zA-Z0-9]{8})', webpage,
'jwplatform id', default=None)
if jwplatform_id:
return self.url_result(
'jwplatform:' + jwplatform_id, ie='JWPlatform',
video_id=video_id)
sources = self._parse_json(
self._search_regex(
r'sources\s*=\s*(\[.+?\])\s*;', webpage, 'sources'),
video_id, transform_source=js_to_json)
formats = []
for source in sources:
if not isinstance(source, dict):
continue
source_url = url_or_none(source.get('file'))
if not source_url:
continue
ext = determine_ext(source_url)
if ext == 'm3u8':
formats.extend(self._extract_m3u8_formats(
source_url, video_id, 'mp4', entry_protocol='m3u8_native',
m3u8_id='hls', fatal=False))
else:
formats.append({
'url': source_url,
})
self._sort_formats(formats)
thumbnail = self._search_regex(
r'image\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage,
'thumbnail', fatal=False, group='url')
return {
'id': video_id,
'title': video_id,
'thumbnail': thumbnail,
'formats': formats,
}

View File

@@ -0,0 +1,95 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
int_or_none,
merge_dicts,
mimetype2ext,
url_or_none,
)
class AparatIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?aparat\.com/(?:v/|video/video/embed/videohash/)(?P<id>[a-zA-Z0-9]+)'
_TESTS = [{
'url': 'http://www.aparat.com/v/wP8On',
'md5': '131aca2e14fe7c4dcb3c4877ba300c89',
'info_dict': {
'id': 'wP8On',
'ext': 'mp4',
'title': 'تیم گلکسی 11 - زومیت',
'description': 'md5:096bdabcdcc4569f2b8a5e903a3b3028',
'duration': 231,
'timestamp': 1387394859,
'upload_date': '20131218',
'view_count': int,
},
}, {
# multiple formats
'url': 'https://www.aparat.com/v/8dflw/',
'only_matching': True,
}]
def _real_extract(self, url):
video_id = self._match_id(url)
# Provides more metadata
webpage = self._download_webpage(url, video_id, fatal=False)
if not webpage:
# Note: There is an easier-to-parse configuration at
# http://www.aparat.com/video/video/config/videohash/%video_id
# but the URL in there does not work
webpage = self._download_webpage(
'http://www.aparat.com/video/video/embed/vt/frame/showvideo/yes/videohash/' + video_id,
video_id)
options = self._parse_json(
self._search_regex(
r'options\s*=\s*JSON\.parse\(\s*(["\'])(?P<value>(?:(?!\1).)+)\1\s*\)',
webpage, 'options', group='value'),
video_id)
player = options['plugins']['sabaPlayerPlugin']
formats = []
for sources in player['multiSRC']:
for item in sources:
if not isinstance(item, dict):
continue
file_url = url_or_none(item.get('src'))
if not file_url:
continue
item_type = item.get('type')
if item_type == 'application/vnd.apple.mpegurl':
formats.extend(self._extract_m3u8_formats(
file_url, video_id, 'mp4',
entry_protocol='m3u8_native', m3u8_id='hls',
fatal=False))
else:
ext = mimetype2ext(item.get('type'))
label = item.get('label')
formats.append({
'url': file_url,
'ext': ext,
'format_id': 'http-%s' % (label or ext),
'height': int_or_none(self._search_regex(
r'(\d+)[pP]', label or '', 'height',
default=None)),
})
self._sort_formats(
formats, field_preference=('height', 'width', 'tbr', 'format_id'))
info = self._search_json_ld(webpage, video_id, default={})
if not info.get('title'):
info['title'] = player['title']
return merge_dicts(info, {
'id': video_id,
'thumbnail': url_or_none(options.get('poster')),
'duration': int_or_none(player.get('duration')),
'formats': formats,
})

View File

@@ -0,0 +1,50 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
str_to_int,
ExtractorError
)
class AppleConnectIE(InfoExtractor):
_VALID_URL = r'https?://itunes\.apple\.com/\w{0,2}/?post/idsa\.(?P<id>[\w-]+)'
_TEST = {
'url': 'https://itunes.apple.com/us/post/idsa.4ab17a39-2720-11e5-96c5-a5b38f6c42d3',
'md5': 'e7c38568a01ea45402570e6029206723',
'info_dict': {
'id': '4ab17a39-2720-11e5-96c5-a5b38f6c42d3',
'ext': 'm4v',
'title': 'Energy',
'uploader': 'Drake',
'thumbnail': r're:^https?://.*\.jpg$',
'upload_date': '20150710',
'timestamp': 1436545535,
},
}
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
try:
video_json = self._html_search_regex(
r'class="auc-video-data">(\{.*?\})', webpage, 'json')
except ExtractorError:
raise ExtractorError('This post doesn\'t contain a video', expected=True)
video_data = self._parse_json(video_json, video_id)
timestamp = str_to_int(self._html_search_regex(r'data-timestamp="(\d+)"', webpage, 'timestamp'))
like_count = str_to_int(self._html_search_regex(r'(\d+) Loves', webpage, 'like count'))
return {
'id': video_id,
'url': video_data['sslSrc'],
'title': video_data['title'],
'description': video_data['description'],
'uploader': video_data['artistName'],
'thumbnail': video_data['artworkUrl'],
'timestamp': timestamp,
'like_count': like_count,
}

View File

@@ -0,0 +1,283 @@
from __future__ import unicode_literals
import re
import json
from .common import InfoExtractor
from ..compat import compat_urlparse
from ..utils import (
int_or_none,
parse_duration,
unified_strdate,
)
class AppleTrailersIE(InfoExtractor):
IE_NAME = 'appletrailers'
_VALID_URL = r'https?://(?:www\.|movie)?trailers\.apple\.com/(?:trailers|ca)/(?P<company>[^/]+)/(?P<movie>[^/]+)'
_TESTS = [{
'url': 'http://trailers.apple.com/trailers/wb/manofsteel/',
'info_dict': {
'id': '5111',
'title': 'Man of Steel',
},
'playlist': [
{
'md5': 'd97a8e575432dbcb81b7c3acb741f8a8',
'info_dict': {
'id': 'manofsteel-trailer4',
'ext': 'mov',
'duration': 111,
'title': 'Trailer 4',
'upload_date': '20130523',
'uploader_id': 'wb',
},
},
{
'md5': 'b8017b7131b721fb4e8d6f49e1df908c',
'info_dict': {
'id': 'manofsteel-trailer3',
'ext': 'mov',
'duration': 182,
'title': 'Trailer 3',
'upload_date': '20130417',
'uploader_id': 'wb',
},
},
{
'md5': 'd0f1e1150989b9924679b441f3404d48',
'info_dict': {
'id': 'manofsteel-trailer',
'ext': 'mov',
'duration': 148,
'title': 'Trailer',
'upload_date': '20121212',
'uploader_id': 'wb',
},
},
{
'md5': '5fe08795b943eb2e757fa95cb6def1cb',
'info_dict': {
'id': 'manofsteel-teaser',
'ext': 'mov',
'duration': 93,
'title': 'Teaser',
'upload_date': '20120721',
'uploader_id': 'wb',
},
},
]
}, {
'url': 'http://trailers.apple.com/trailers/magnolia/blackthorn/',
'info_dict': {
'id': '4489',
'title': 'Blackthorn',
},
'playlist_mincount': 2,
'expected_warnings': ['Unable to download JSON metadata'],
}, {
# json data only available from http://trailers.apple.com/trailers/feeds/data/15881.json
'url': 'http://trailers.apple.com/trailers/fox/kungfupanda3/',
'info_dict': {
'id': '15881',
'title': 'Kung Fu Panda 3',
},
'playlist_mincount': 4,
}, {
'url': 'http://trailers.apple.com/ca/metropole/autrui/',
'only_matching': True,
}, {
'url': 'http://movietrailers.apple.com/trailers/focus_features/kuboandthetwostrings/',
'only_matching': True,
}]
_JSON_RE = r'iTunes.playURL\((.*?)\);'
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
movie = mobj.group('movie')
uploader_id = mobj.group('company')
webpage = self._download_webpage(url, movie)
film_id = self._search_regex(r"FilmId\s*=\s*'(\d+)'", webpage, 'film id')
film_data = self._download_json(
'http://trailers.apple.com/trailers/feeds/data/%s.json' % film_id,
film_id, fatal=False)
if film_data:
entries = []
for clip in film_data.get('clips', []):
clip_title = clip['title']
formats = []
for version, version_data in clip.get('versions', {}).items():
for size, size_data in version_data.get('sizes', {}).items():
src = size_data.get('src')
if not src:
continue
formats.append({
'format_id': '%s-%s' % (version, size),
'url': re.sub(r'_(\d+p\.mov)', r'_h\1', src),
'width': int_or_none(size_data.get('width')),
'height': int_or_none(size_data.get('height')),
'language': version[:2],
})
self._sort_formats(formats)
entries.append({
'id': movie + '-' + re.sub(r'[^a-zA-Z0-9]', '', clip_title).lower(),
'formats': formats,
'title': clip_title,
'thumbnail': clip.get('screen') or clip.get('thumb'),
'duration': parse_duration(clip.get('runtime') or clip.get('faded')),
'upload_date': unified_strdate(clip.get('posted')),
'uploader_id': uploader_id,
})
page_data = film_data.get('page', {})
return self.playlist_result(entries, film_id, page_data.get('movie_title'))
playlist_url = compat_urlparse.urljoin(url, 'includes/playlists/itunes.inc')
def fix_html(s):
s = re.sub(r'(?s)<script[^<]*?>.*?</script>', '', s)
s = re.sub(r'<img ([^<]*?)/?>', r'<img \1/>', s)
# The ' in the onClick attributes are not escaped, it couldn't be parsed
# like: http://trailers.apple.com/trailers/wb/gravity/
def _clean_json(m):
return 'iTunes.playURL(%s);' % m.group(1).replace('\'', '&#39;')
s = re.sub(self._JSON_RE, _clean_json, s)
s = '<html>%s</html>' % s
return s
doc = self._download_xml(playlist_url, movie, transform_source=fix_html)
playlist = []
for li in doc.findall('./div/ul/li'):
on_click = li.find('.//a').attrib['onClick']
trailer_info_json = self._search_regex(self._JSON_RE,
on_click, 'trailer info')
trailer_info = json.loads(trailer_info_json)
first_url = trailer_info.get('url')
if not first_url:
continue
title = trailer_info['title']
video_id = movie + '-' + re.sub(r'[^a-zA-Z0-9]', '', title).lower()
thumbnail = li.find('.//img').attrib['src']
upload_date = trailer_info['posted'].replace('-', '')
runtime = trailer_info['runtime']
m = re.search(r'(?P<minutes>[0-9]+):(?P<seconds>[0-9]{1,2})', runtime)
duration = None
if m:
duration = 60 * int(m.group('minutes')) + int(m.group('seconds'))
trailer_id = first_url.split('/')[-1].rpartition('_')[0].lower()
settings_json_url = compat_urlparse.urljoin(url, 'includes/settings/%s.json' % trailer_id)
settings = self._download_json(settings_json_url, trailer_id, 'Downloading settings json')
formats = []
for format in settings['metadata']['sizes']:
# The src is a file pointing to the real video file
format_url = re.sub(r'_(\d*p\.mov)', r'_h\1', format['src'])
formats.append({
'url': format_url,
'format': format['type'],
'width': int_or_none(format['width']),
'height': int_or_none(format['height']),
})
self._sort_formats(formats)
playlist.append({
'_type': 'video',
'id': video_id,
'formats': formats,
'title': title,
'duration': duration,
'thumbnail': thumbnail,
'upload_date': upload_date,
'uploader_id': uploader_id,
'http_headers': {
'User-Agent': 'QuickTime compatible (youtube-dlc)',
},
})
return {
'_type': 'playlist',
'id': movie,
'entries': playlist,
}
class AppleTrailersSectionIE(InfoExtractor):
IE_NAME = 'appletrailers:section'
_SECTIONS = {
'justadded': {
'feed_path': 'just_added',
'title': 'Just Added',
},
'exclusive': {
'feed_path': 'exclusive',
'title': 'Exclusive',
},
'justhd': {
'feed_path': 'just_hd',
'title': 'Just HD',
},
'mostpopular': {
'feed_path': 'most_pop',
'title': 'Most Popular',
},
'moviestudios': {
'feed_path': 'studios',
'title': 'Movie Studios',
},
}
_VALID_URL = r'https?://(?:www\.)?trailers\.apple\.com/#section=(?P<id>%s)' % '|'.join(_SECTIONS)
_TESTS = [{
'url': 'http://trailers.apple.com/#section=justadded',
'info_dict': {
'title': 'Just Added',
'id': 'justadded',
},
'playlist_mincount': 80,
}, {
'url': 'http://trailers.apple.com/#section=exclusive',
'info_dict': {
'title': 'Exclusive',
'id': 'exclusive',
},
'playlist_mincount': 80,
}, {
'url': 'http://trailers.apple.com/#section=justhd',
'info_dict': {
'title': 'Just HD',
'id': 'justhd',
},
'playlist_mincount': 80,
}, {
'url': 'http://trailers.apple.com/#section=mostpopular',
'info_dict': {
'title': 'Most Popular',
'id': 'mostpopular',
},
'playlist_mincount': 30,
}, {
'url': 'http://trailers.apple.com/#section=moviestudios',
'info_dict': {
'title': 'Movie Studios',
'id': 'moviestudios',
},
'playlist_mincount': 80,
}]
def _real_extract(self, url):
section = self._match_id(url)
section_data = self._download_json(
'http://trailers.apple.com/trailers/home/feeds/%s.json' % self._SECTIONS[section]['feed_path'],
section)
entries = [
self.url_result('http://trailers.apple.com' + e['location'])
for e in section_data]
return self.playlist_result(entries, section, self._SECTIONS[section]['title'])

View File

@@ -0,0 +1,65 @@
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
unified_strdate,
clean_html,
)
class ArchiveOrgIE(InfoExtractor):
IE_NAME = 'archive.org'
IE_DESC = 'archive.org videos'
_VALID_URL = r'https?://(?:www\.)?archive\.org/(?:details|embed)/(?P<id>[^/?#]+)(?:[?].*)?$'
_TESTS = [{
'url': 'http://archive.org/details/XD300-23_68HighlightsAResearchCntAugHumanIntellect',
'md5': '8af1d4cf447933ed3c7f4871162602db',
'info_dict': {
'id': 'XD300-23_68HighlightsAResearchCntAugHumanIntellect',
'ext': 'ogg',
'title': '1968 Demo - FJCC Conference Presentation Reel #1',
'description': 'md5:da45c349df039f1cc8075268eb1b5c25',
'upload_date': '19681210',
'uploader': 'SRI International'
}
}, {
'url': 'https://archive.org/details/Cops1922',
'md5': '0869000b4ce265e8ca62738b336b268a',
'info_dict': {
'id': 'Cops1922',
'ext': 'mp4',
'title': 'Buster Keaton\'s "Cops" (1922)',
'description': 'md5:89e7c77bf5d965dd5c0372cfb49470f6',
}
}, {
'url': 'http://archive.org/embed/XD300-23_68HighlightsAResearchCntAugHumanIntellect',
'only_matching': True,
}]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(
'http://archive.org/embed/' + video_id, video_id)
jwplayer_playlist = self._parse_json(self._search_regex(
r"(?s)Play\('[^']+'\s*,\s*(\[.+\])\s*,\s*{.*?}\)",
webpage, 'jwplayer playlist'), video_id)
info = self._parse_jwplayer_data(
{'playlist': jwplayer_playlist}, video_id, base_url=url)
def get_optional(metadata, field):
return metadata.get(field, [None])[0]
metadata = self._download_json(
'http://archive.org/details/' + video_id, video_id, query={
'output': 'json',
})['metadata']
info.update({
'title': get_optional(metadata, 'title') or info.get('title'),
'description': clean_html(get_optional(metadata, 'description')),
})
if info.get('_type') != 'playlist':
info.update({
'uploader': get_optional(metadata, 'creator'),
'upload_date': unified_strdate(get_optional(metadata, 'date')),
})
return info

View File

@@ -0,0 +1,422 @@
# coding: utf-8
from __future__ import unicode_literals
import json
import re
from .common import InfoExtractor
from .generic import GenericIE
from ..utils import (
determine_ext,
ExtractorError,
int_or_none,
parse_duration,
qualities,
str_or_none,
try_get,
unified_strdate,
unified_timestamp,
update_url_query,
url_or_none,
xpath_text,
)
from ..compat import compat_etree_fromstring
class ARDMediathekBaseIE(InfoExtractor):
_GEO_COUNTRIES = ['DE']
def _extract_media_info(self, media_info_url, webpage, video_id):
media_info = self._download_json(
media_info_url, video_id, 'Downloading media JSON')
return self._parse_media_info(media_info, video_id, '"fsk"' in webpage)
def _parse_media_info(self, media_info, video_id, fsk):
formats = self._extract_formats(media_info, video_id)
if not formats:
if fsk:
raise ExtractorError(
'This video is only available after 20:00', expected=True)
elif media_info.get('_geoblocked'):
self.raise_geo_restricted(
'This video is not available due to geoblocking',
countries=self._GEO_COUNTRIES)
self._sort_formats(formats)
subtitles = {}
subtitle_url = media_info.get('_subtitleUrl')
if subtitle_url:
subtitles['de'] = [{
'ext': 'ttml',
'url': subtitle_url,
}]
return {
'id': video_id,
'duration': int_or_none(media_info.get('_duration')),
'thumbnail': media_info.get('_previewImage'),
'is_live': media_info.get('_isLive') is True,
'formats': formats,
'subtitles': subtitles,
}
def _extract_formats(self, media_info, video_id):
type_ = media_info.get('_type')
media_array = media_info.get('_mediaArray', [])
formats = []
for num, media in enumerate(media_array):
for stream in media.get('_mediaStreamArray', []):
stream_urls = stream.get('_stream')
if not stream_urls:
continue
if not isinstance(stream_urls, list):
stream_urls = [stream_urls]
quality = stream.get('_quality')
server = stream.get('_server')
for stream_url in stream_urls:
if not url_or_none(stream_url):
continue
ext = determine_ext(stream_url)
if quality != 'auto' and ext in ('f4m', 'm3u8'):
continue
if ext == 'f4m':
formats.extend(self._extract_f4m_formats(
update_url_query(stream_url, {
'hdcore': '3.1.1',
'plugin': 'aasp-3.1.1.69.124'
}), video_id, f4m_id='hds', fatal=False))
elif ext == 'm3u8':
formats.extend(self._extract_m3u8_formats(
stream_url, video_id, 'mp4', 'm3u8_native',
m3u8_id='hls', fatal=False))
else:
if server and server.startswith('rtmp'):
f = {
'url': server,
'play_path': stream_url,
'format_id': 'a%s-rtmp-%s' % (num, quality),
}
else:
f = {
'url': stream_url,
'format_id': 'a%s-%s-%s' % (num, ext, quality)
}
m = re.search(
r'_(?P<width>\d+)x(?P<height>\d+)\.mp4$',
stream_url)
if m:
f.update({
'width': int(m.group('width')),
'height': int(m.group('height')),
})
if type_ == 'audio':
f['vcodec'] = 'none'
formats.append(f)
return formats
class ARDMediathekIE(ARDMediathekBaseIE):
IE_NAME = 'ARD:mediathek'
_VALID_URL = r'^https?://(?:(?:(?:www|classic)\.)?ardmediathek\.de|mediathek\.(?:daserste|rbb-online)\.de|one\.ard\.de)/(?:.*/)(?P<video_id>[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?'
_TESTS = [{
# available till 26.07.2022
'url': 'http://www.ardmediathek.de/tv/S%C3%9CDLICHT/Was-ist-die-Kunst-der-Zukunft-liebe-Ann/BR-Fernsehen/Video?bcastId=34633636&documentId=44726822',
'info_dict': {
'id': '44726822',
'ext': 'mp4',
'title': 'Was ist die Kunst der Zukunft, liebe Anna McCarthy?',
'description': 'md5:4ada28b3e3b5df01647310e41f3a62f5',
'duration': 1740,
},
'params': {
# m3u8 download
'skip_download': True,
}
}, {
'url': 'https://one.ard.de/tv/Mord-mit-Aussicht/Mord-mit-Aussicht-6-39-T%C3%B6dliche-Nach/ONE/Video?bcastId=46384294&documentId=55586872',
'only_matching': True,
}, {
# audio
'url': 'http://www.ardmediathek.de/tv/WDR-H%C3%B6rspiel-Speicher/Tod-eines-Fu%C3%9Fballers/WDR-3/Audio-Podcast?documentId=28488308&bcastId=23074086',
'only_matching': True,
}, {
'url': 'http://mediathek.daserste.de/sendungen_a-z/328454_anne-will/22429276_vertrauen-ist-gut-spionieren-ist-besser-geht',
'only_matching': True,
}, {
# audio
'url': 'http://mediathek.rbb-online.de/radio/Hörspiel/Vor-dem-Fest/kulturradio/Audio?documentId=30796318&topRessort=radio&bcastId=9839158',
'only_matching': True,
}, {
'url': 'https://classic.ardmediathek.de/tv/Panda-Gorilla-Co/Panda-Gorilla-Co-Folge-274/Das-Erste/Video?bcastId=16355486&documentId=58234698',
'only_matching': True,
}]
@classmethod
def suitable(cls, url):
return False if ARDBetaMediathekIE.suitable(url) else super(ARDMediathekIE, cls).suitable(url)
def _real_extract(self, url):
# determine video id from url
m = re.match(self._VALID_URL, url)
document_id = None
numid = re.search(r'documentId=([0-9]+)', url)
if numid:
document_id = video_id = numid.group(1)
else:
video_id = m.group('video_id')
webpage = self._download_webpage(url, video_id)
ERRORS = (
('>Leider liegt eine Störung vor.', 'Video %s is unavailable'),
('>Der gewünschte Beitrag ist nicht mehr verfügbar.<',
'Video %s is no longer available'),
)
for pattern, message in ERRORS:
if pattern in webpage:
raise ExtractorError(message % video_id, expected=True)
if re.search(r'[\?&]rss($|[=&])', url):
doc = compat_etree_fromstring(webpage.encode('utf-8'))
if doc.tag == 'rss':
return GenericIE()._extract_rss(url, video_id, doc)
title = self._html_search_regex(
[r'<h1(?:\s+class="boxTopHeadline")?>(.*?)</h1>',
r'<meta name="dcterms\.title" content="(.*?)"/>',
r'<h4 class="headline">(.*?)</h4>',
r'<title[^>]*>(.*?)</title>'],
webpage, 'title')
description = self._html_search_meta(
'dcterms.abstract', webpage, 'description', default=None)
if description is None:
description = self._html_search_meta(
'description', webpage, 'meta description', default=None)
if description is None:
description = self._html_search_regex(
r'<p\s+class="teasertext">(.+?)</p>',
webpage, 'teaser text', default=None)
# Thumbnail is sometimes not present.
# It is in the mobile version, but that seems to use a different URL
# structure altogether.
thumbnail = self._og_search_thumbnail(webpage, default=None)
media_streams = re.findall(r'''(?x)
mediaCollection\.addMediaStream\([0-9]+,\s*[0-9]+,\s*"[^"]*",\s*
"([^"]+)"''', webpage)
if media_streams:
QUALITIES = qualities(['lo', 'hi', 'hq'])
formats = []
for furl in set(media_streams):
if furl.endswith('.f4m'):
fid = 'f4m'
else:
fid_m = re.match(r'.*\.([^.]+)\.[^.]+$', furl)
fid = fid_m.group(1) if fid_m else None
formats.append({
'quality': QUALITIES(fid),
'format_id': fid,
'url': furl,
})
self._sort_formats(formats)
info = {
'formats': formats,
}
else: # request JSON file
if not document_id:
video_id = self._search_regex(
r'/play/(?:config|media)/(\d+)', webpage, 'media id')
info = self._extract_media_info(
'http://www.ardmediathek.de/play/media/%s' % video_id,
webpage, video_id)
info.update({
'id': video_id,
'title': self._live_title(title) if info.get('is_live') else title,
'description': description,
'thumbnail': thumbnail,
})
return info
class ARDIE(InfoExtractor):
_VALID_URL = r'(?P<mainurl>https?://(www\.)?daserste\.de/[^?#]+/videos(?:extern)?/(?P<display_id>[^/?#]+)-(?P<id>[0-9]+))\.html'
_TESTS = [{
# available till 14.02.2019
'url': 'http://www.daserste.de/information/talk/maischberger/videos/das-groko-drama-zerlegen-sich-die-volksparteien-video-102.html',
'md5': '8e4ec85f31be7c7fc08a26cdbc5a1f49',
'info_dict': {
'display_id': 'das-groko-drama-zerlegen-sich-die-volksparteien-video',
'id': '102',
'ext': 'mp4',
'duration': 4435.0,
'title': 'Das GroKo-Drama: Zerlegen sich die Volksparteien?',
'upload_date': '20180214',
'thumbnail': r're:^https?://.*\.jpg$',
},
}, {
'url': 'https://www.daserste.de/information/reportage-dokumentation/erlebnis-erde/videosextern/woelfe-und-herdenschutzhunde-ungleiche-brueder-102.html',
'only_matching': True,
}, {
'url': 'http://www.daserste.de/information/reportage-dokumentation/dokus/videos/die-story-im-ersten-mission-unter-falscher-flagge-100.html',
'only_matching': True,
}]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
display_id = mobj.group('display_id')
player_url = mobj.group('mainurl') + '~playerXml.xml'
doc = self._download_xml(player_url, display_id)
video_node = doc.find('./video')
upload_date = unified_strdate(xpath_text(
video_node, './broadcastDate'))
thumbnail = xpath_text(video_node, './/teaserImage//variant/url')
formats = []
for a in video_node.findall('.//asset'):
f = {
'format_id': a.attrib['type'],
'width': int_or_none(a.find('./frameWidth').text),
'height': int_or_none(a.find('./frameHeight').text),
'vbr': int_or_none(a.find('./bitrateVideo').text),
'abr': int_or_none(a.find('./bitrateAudio').text),
'vcodec': a.find('./codecVideo').text,
'tbr': int_or_none(a.find('./totalBitrate').text),
}
if a.find('./serverPrefix').text:
f['url'] = a.find('./serverPrefix').text
f['playpath'] = a.find('./fileName').text
else:
f['url'] = a.find('./fileName').text
formats.append(f)
self._sort_formats(formats)
return {
'id': mobj.group('id'),
'formats': formats,
'display_id': display_id,
'title': video_node.find('./title').text,
'duration': parse_duration(video_node.find('./duration').text),
'upload_date': upload_date,
'thumbnail': thumbnail,
}
class ARDBetaMediathekIE(ARDMediathekBaseIE):
_VALID_URL = r'https://(?:(?:beta|www)\.)?ardmediathek\.de/(?P<client>[^/]+)/(?:player|live|video)/(?P<display_id>(?:[^/]+/)*)(?P<video_id>[a-zA-Z0-9]+)'
_TESTS = [{
'url': 'https://ardmediathek.de/ard/video/die-robuste-roswita/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE',
'md5': 'dfdc87d2e7e09d073d5a80770a9ce88f',
'info_dict': {
'display_id': 'die-robuste-roswita',
'id': '70153354',
'title': 'Die robuste Roswita',
'description': r're:^Der Mord.*trüber ist als die Ilm.',
'duration': 5316,
'thumbnail': 'https://img.ardmediathek.de/standard/00/70/15/33/90/-1852531467/16x9/960?mandant=ard',
'timestamp': 1577047500,
'upload_date': '20191222',
'ext': 'mp4',
},
}, {
'url': 'https://beta.ardmediathek.de/ard/video/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE',
'only_matching': True,
}, {
'url': 'https://ardmediathek.de/ard/video/saartalk/saartalk-gesellschaftsgift-haltung-gegen-hass/sr-fernsehen/Y3JpZDovL3NyLW9ubGluZS5kZS9TVF84MTY4MA/',
'only_matching': True,
}, {
'url': 'https://www.ardmediathek.de/ard/video/trailer/private-eyes-s01-e01/one/Y3JpZDovL3dkci5kZS9CZWl0cmFnLTE1MTgwYzczLWNiMTEtNGNkMS1iMjUyLTg5MGYzOWQxZmQ1YQ/',
'only_matching': True,
}, {
'url': 'https://www.ardmediathek.de/ard/player/Y3JpZDovL3N3ci5kZS9hZXgvbzEwNzE5MTU/',
'only_matching': True,
}, {
'url': 'https://www.ardmediathek.de/swr/live/Y3JpZDovL3N3ci5kZS8xMzQ4MTA0Mg',
'only_matching': True,
}]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('video_id')
display_id = mobj.group('display_id')
if display_id:
display_id = display_id.rstrip('/')
if not display_id:
display_id = video_id
player_page = self._download_json(
'https://api.ardmediathek.de/public-gateway',
display_id, data=json.dumps({
'query': '''{
playerPage(client:"%s", clipId: "%s") {
blockedByFsk
broadcastedOn
maturityContentRating
mediaCollection {
_duration
_geoblocked
_isLive
_mediaArray {
_mediaStreamArray {
_quality
_server
_stream
}
}
_previewImage
_subtitleUrl
_type
}
show {
title
}
synopsis
title
tracking {
atiCustomVars {
contentId
}
}
}
}''' % (mobj.group('client'), video_id),
}).encode(), headers={
'Content-Type': 'application/json'
})['data']['playerPage']
title = player_page['title']
content_id = str_or_none(try_get(
player_page, lambda x: x['tracking']['atiCustomVars']['contentId']))
media_collection = player_page.get('mediaCollection') or {}
if not media_collection and content_id:
media_collection = self._download_json(
'https://www.ardmediathek.de/play/media/' + content_id,
content_id, fatal=False) or {}
info = self._parse_media_info(
media_collection, content_id or video_id,
player_page.get('blockedByFsk'))
age_limit = None
description = player_page.get('synopsis')
maturity_content_rating = player_page.get('maturityContentRating')
if maturity_content_rating:
age_limit = int_or_none(maturity_content_rating.lstrip('FSK'))
if not age_limit and description:
age_limit = int_or_none(self._search_regex(
r'\(FSK\s*(\d+)\)\s*$', description, 'age limit', default=None))
info.update({
'age_limit': age_limit,
'display_id': display_id,
'title': title,
'description': description,
'timestamp': unified_timestamp(player_page.get('broadcastedOn')),
'series': try_get(player_page, lambda x: x['show']['title']),
})
return info

View File

@@ -0,0 +1,133 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..compat import compat_urlparse
from ..utils import (
determine_ext,
ExtractorError,
float_or_none,
int_or_none,
mimetype2ext,
parse_iso8601,
strip_jsonp,
)
class ArkenaIE(InfoExtractor):
_VALID_URL = r'''(?x)
https?://
(?:
video\.arkena\.com/play2/embed/player\?|
play\.arkena\.com/(?:config|embed)/avp/v\d/player/media/(?P<id>[^/]+)/[^/]+/(?P<account_id>\d+)
)
'''
_TESTS = [{
'url': 'https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411',
'md5': 'b96f2f71b359a8ecd05ce4e1daa72365',
'info_dict': {
'id': 'b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe',
'ext': 'mp4',
'title': 'Big Buck Bunny',
'description': 'Royalty free test video',
'timestamp': 1432816365,
'upload_date': '20150528',
'is_live': False,
},
}, {
'url': 'https://play.arkena.com/config/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411/?callbackMethod=jQuery1111023664739129262213_1469227693893',
'only_matching': True,
}, {
'url': 'http://play.arkena.com/config/avp/v1/player/media/327336/darkmatter/131064/?callbackMethod=jQuery1111002221189684892677_1469227595972',
'only_matching': True,
}, {
'url': 'http://play.arkena.com/embed/avp/v1/player/media/327336/darkmatter/131064/',
'only_matching': True,
}, {
'url': 'http://video.arkena.com/play2/embed/player?accountId=472718&mediaId=35763b3b-00090078-bf604299&pageStyling=styled',
'only_matching': True,
}]
@staticmethod
def _extract_url(webpage):
# See https://support.arkena.com/display/PLAY/Ways+to+embed+your+video
mobj = re.search(
r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//play\.arkena\.com/embed/avp/.+?)\1',
webpage)
if mobj:
return mobj.group('url')
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
account_id = mobj.group('account_id')
# Handle http://video.arkena.com/play2/embed/player URL
if not video_id:
qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
video_id = qs.get('mediaId', [None])[0]
account_id = qs.get('accountId', [None])[0]
if not video_id or not account_id:
raise ExtractorError('Invalid URL', expected=True)
playlist = self._download_json(
'https://play.arkena.com/config/avp/v2/player/media/%s/0/%s/?callbackMethod=_'
% (video_id, account_id),
video_id, transform_source=strip_jsonp)['Playlist'][0]
media_info = playlist['MediaInfo']
title = media_info['Title']
media_files = playlist['MediaFiles']
is_live = False
formats = []
for kind_case, kind_formats in media_files.items():
kind = kind_case.lower()
for f in kind_formats:
f_url = f.get('Url')
if not f_url:
continue
is_live = f.get('Live') == 'true'
exts = (mimetype2ext(f.get('Type')), determine_ext(f_url, None))
if kind == 'm3u8' or 'm3u8' in exts:
formats.extend(self._extract_m3u8_formats(
f_url, video_id, 'mp4', 'm3u8_native',
m3u8_id=kind, fatal=False, live=is_live))
elif kind == 'flash' or 'f4m' in exts:
formats.extend(self._extract_f4m_formats(
f_url, video_id, f4m_id=kind, fatal=False))
elif kind == 'dash' or 'mpd' in exts:
formats.extend(self._extract_mpd_formats(
f_url, video_id, mpd_id=kind, fatal=False))
elif kind == 'silverlight':
# TODO: process when ism is supported (see
# https://github.com/ytdl-org/youtube-dl/issues/8118)
continue
else:
tbr = float_or_none(f.get('Bitrate'), 1000)
formats.append({
'url': f_url,
'format_id': '%s-%d' % (kind, tbr) if tbr else kind,
'tbr': tbr,
})
self._sort_formats(formats)
description = media_info.get('Description')
video_id = media_info.get('VideoId') or video_id
timestamp = parse_iso8601(media_info.get('PublishDate'))
thumbnails = [{
'url': thumbnail['Url'],
'width': int_or_none(thumbnail.get('Size')),
} for thumbnail in (media_info.get('Poster') or []) if thumbnail.get('Url')]
return {
'id': video_id,
'title': title,
'description': description,
'timestamp': timestamp,
'is_live': is_live,
'thumbnails': thumbnails,
'formats': formats,
}

View File

@@ -0,0 +1,201 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
ExtractorError,
int_or_none,
qualities,
try_get,
unified_strdate,
)
# There are different sources of video in arte.tv, the extraction process
# is different for each one. The videos usually expire in 7 days, so we can't
# add tests.
class ArteTVBaseIE(InfoExtractor):
def _extract_from_json_url(self, json_url, video_id, lang, title=None):
info = self._download_json(json_url, video_id)
player_info = info['videoJsonPlayer']
vsr = try_get(player_info, lambda x: x['VSR'], dict)
if not vsr:
error = None
if try_get(player_info, lambda x: x['custom_msg']['type']) == 'error':
error = try_get(
player_info, lambda x: x['custom_msg']['msg'], compat_str)
if not error:
error = 'Video %s is not available' % player_info.get('VID') or video_id
raise ExtractorError(error, expected=True)
upload_date_str = player_info.get('shootingDate')
if not upload_date_str:
upload_date_str = (player_info.get('VRA') or player_info.get('VDA') or '').split(' ')[0]
title = (player_info.get('VTI') or title or player_info['VID']).strip()
subtitle = player_info.get('VSU', '').strip()
if subtitle:
title += ' - %s' % subtitle
info_dict = {
'id': player_info['VID'],
'title': title,
'description': player_info.get('VDE'),
'upload_date': unified_strdate(upload_date_str),
'thumbnail': player_info.get('programImage') or player_info.get('VTU', {}).get('IUR'),
}
qfunc = qualities(['MQ', 'HQ', 'EQ', 'SQ'])
LANGS = {
'fr': 'F',
'de': 'A',
'en': 'E[ANG]',
'es': 'E[ESP]',
'it': 'E[ITA]',
'pl': 'E[POL]',
}
langcode = LANGS.get(lang, lang)
formats = []
for format_id, format_dict in vsr.items():
f = dict(format_dict)
versionCode = f.get('versionCode')
l = re.escape(langcode)
# Language preference from most to least priority
# Reference: section 6.8 of
# https://www.arte.tv/sites/en/corporate/files/complete-technical-guidelines-arte-geie-v1-07-1.pdf
PREFERENCES = (
# original version in requested language, without subtitles
r'VO{0}$'.format(l),
# original version in requested language, with partial subtitles in requested language
r'VO{0}-ST{0}$'.format(l),
# original version in requested language, with subtitles for the deaf and hard-of-hearing in requested language
r'VO{0}-STM{0}$'.format(l),
# non-original (dubbed) version in requested language, without subtitles
r'V{0}$'.format(l),
# non-original (dubbed) version in requested language, with subtitles partial subtitles in requested language
r'V{0}-ST{0}$'.format(l),
# non-original (dubbed) version in requested language, with subtitles for the deaf and hard-of-hearing in requested language
r'V{0}-STM{0}$'.format(l),
# original version in requested language, with partial subtitles in different language
r'VO{0}-ST(?!{0}).+?$'.format(l),
# original version in requested language, with subtitles for the deaf and hard-of-hearing in different language
r'VO{0}-STM(?!{0}).+?$'.format(l),
# original version in different language, with partial subtitles in requested language
r'VO(?:(?!{0}).+?)?-ST{0}$'.format(l),
# original version in different language, with subtitles for the deaf and hard-of-hearing in requested language
r'VO(?:(?!{0}).+?)?-STM{0}$'.format(l),
# original version in different language, without subtitles
r'VO(?:(?!{0}))?$'.format(l),
# original version in different language, with partial subtitles in different language
r'VO(?:(?!{0}).+?)?-ST(?!{0}).+?$'.format(l),
# original version in different language, with subtitles for the deaf and hard-of-hearing in different language
r'VO(?:(?!{0}).+?)?-STM(?!{0}).+?$'.format(l),
)
for pref, p in enumerate(PREFERENCES):
if re.match(p, versionCode):
lang_pref = len(PREFERENCES) - pref
break
else:
lang_pref = -1
format = {
'format_id': format_id,
'preference': -10 if f.get('videoFormat') == 'M3U8' else None,
'language_preference': lang_pref,
'format_note': '%s, %s' % (f.get('versionCode'), f.get('versionLibelle')),
'width': int_or_none(f.get('width')),
'height': int_or_none(f.get('height')),
'tbr': int_or_none(f.get('bitrate')),
'quality': qfunc(f.get('quality')),
}
if f.get('mediaType') == 'rtmp':
format['url'] = f['streamer']
format['play_path'] = 'mp4:' + f['url']
format['ext'] = 'flv'
else:
format['url'] = f['url']
formats.append(format)
self._check_formats(formats, video_id)
self._sort_formats(formats)
info_dict['formats'] = formats
return info_dict
class ArteTVPlus7IE(ArteTVBaseIE):
IE_NAME = 'arte.tv:+7'
_VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>fr|de|en|es|it|pl)/videos/(?P<id>\d{6}-\d{3}-[AF])'
_TESTS = [{
'url': 'https://www.arte.tv/en/videos/088501-000-A/mexico-stealing-petrol-to-survive/',
'info_dict': {
'id': '088501-000-A',
'ext': 'mp4',
'title': 'Mexico: Stealing Petrol to Survive',
'upload_date': '20190628',
},
}]
def _real_extract(self, url):
lang, video_id = re.match(self._VALID_URL, url).groups()
return self._extract_from_json_url(
'https://api.arte.tv/api/player/v1/config/%s/%s' % (lang, video_id),
video_id, lang)
class ArteTVEmbedIE(ArteTVPlus7IE):
IE_NAME = 'arte.tv:embed'
_VALID_URL = r'''(?x)
https://www\.arte\.tv
/player/v3/index\.php\?json_url=
(?P<json_url>
https?://api\.arte\.tv/api/player/v1/config/
(?P<lang>[^/]+)/(?P<id>\d{6}-\d{3}-[AF])
)
'''
_TESTS = []
def _real_extract(self, url):
json_url, lang, video_id = re.match(self._VALID_URL, url).groups()
return self._extract_from_json_url(json_url, video_id, lang)
class ArteTVPlaylistIE(ArteTVBaseIE):
IE_NAME = 'arte.tv:playlist'
_VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>fr|de|en|es|it|pl)/videos/(?P<id>RC-\d{6})'
_TESTS = [{
'url': 'https://www.arte.tv/en/videos/RC-016954/earn-a-living/',
'info_dict': {
'id': 'RC-016954',
'title': 'Earn a Living',
'description': 'md5:d322c55011514b3a7241f7fb80d494c2',
},
'playlist_mincount': 6,
}]
def _real_extract(self, url):
lang, playlist_id = re.match(self._VALID_URL, url).groups()
collection = self._download_json(
'https://api.arte.tv/api/player/v1/collectionData/%s/%s?source=videos'
% (lang, playlist_id), playlist_id)
title = collection.get('title')
description = collection.get('shortDescription') or collection.get('teaserText')
entries = [
self._extract_from_json_url(
video['jsonUrl'], video.get('programId') or playlist_id, lang)
for video in collection['videos'] if video.get('jsonUrl')]
return self.playlist_result(entries, playlist_id, title, description)

View File

@@ -0,0 +1,145 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from .kaltura import KalturaIE
from ..utils import extract_attributes
class AsianCrushIE(InfoExtractor):
_VALID_URL_BASE = r'https?://(?:www\.)?(?P<host>(?:(?:asiancrush|yuyutv|midnightpulp)\.com|cocoro\.tv))'
_VALID_URL = r'%s/video/(?:[^/]+/)?0+(?P<id>\d+)v\b' % _VALID_URL_BASE
_TESTS = [{
'url': 'https://www.asiancrush.com/video/012869v/women-who-flirt/',
'md5': 'c3b740e48d0ba002a42c0b72857beae6',
'info_dict': {
'id': '1_y4tmjm5r',
'ext': 'mp4',
'title': 'Women Who Flirt',
'description': 'md5:7e986615808bcfb11756eb503a751487',
'timestamp': 1496936429,
'upload_date': '20170608',
'uploader_id': 'craig@crifkin.com',
},
}, {
'url': 'https://www.asiancrush.com/video/she-was-pretty/011886v-pretty-episode-3/',
'only_matching': True,
}, {
'url': 'https://www.yuyutv.com/video/013886v/the-act-of-killing/',
'only_matching': True,
}, {
'url': 'https://www.yuyutv.com/video/peep-show/013922v-warring-factions/',
'only_matching': True,
}, {
'url': 'https://www.midnightpulp.com/video/010400v/drifters/',
'only_matching': True,
}, {
'url': 'https://www.midnightpulp.com/video/mononoke/016378v-zashikiwarashi-part-1/',
'only_matching': True,
}, {
'url': 'https://www.cocoro.tv/video/the-wonderful-wizard-of-oz/008878v-the-wonderful-wizard-of-oz-ep01/',
'only_matching': True,
}]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
host = mobj.group('host')
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
entry_id, partner_id, title = [None] * 3
vars = self._parse_json(
self._search_regex(
r'iEmbedVars\s*=\s*({.+?})', webpage, 'embed vars',
default='{}'), video_id, fatal=False)
if vars:
entry_id = vars.get('entry_id')
partner_id = vars.get('partner_id')
title = vars.get('vid_label')
if not entry_id:
entry_id = self._search_regex(
r'\bentry_id["\']\s*:\s*["\'](\d+)', webpage, 'entry id')
player = self._download_webpage(
'https://api.%s/embeddedVideoPlayer' % host, video_id,
query={'id': entry_id})
kaltura_id = self._search_regex(
r'entry_id["\']\s*:\s*(["\'])(?P<id>(?:(?!\1).)+)\1', player,
'kaltura id', group='id')
if not partner_id:
partner_id = self._search_regex(
r'/p(?:artner_id)?/(\d+)', player, 'partner id',
default='513551')
description = self._html_search_regex(
r'(?s)<div[^>]+\bclass=["\']description["\'][^>]*>(.+?)</div>',
webpage, 'description', fatal=False)
return {
'_type': 'url_transparent',
'url': 'kaltura:%s:%s' % (partner_id, kaltura_id),
'ie_key': KalturaIE.ie_key(),
'id': video_id,
'title': title,
'description': description,
}
class AsianCrushPlaylistIE(InfoExtractor):
_VALID_URL = r'%s/series/0+(?P<id>\d+)s\b' % AsianCrushIE._VALID_URL_BASE
_TESTS = [{
'url': 'https://www.asiancrush.com/series/012481s/scholar-walks-night/',
'info_dict': {
'id': '12481',
'title': 'Scholar Who Walks the Night',
'description': 'md5:7addd7c5132a09fd4741152d96cce886',
},
'playlist_count': 20,
}, {
'url': 'https://www.yuyutv.com/series/013920s/peep-show/',
'only_matching': True,
}, {
'url': 'https://www.midnightpulp.com/series/016375s/mononoke/',
'only_matching': True,
}, {
'url': 'https://www.cocoro.tv/series/008549s/the-wonderful-wizard-of-oz/',
'only_matching': True,
}]
def _real_extract(self, url):
playlist_id = self._match_id(url)
webpage = self._download_webpage(url, playlist_id)
entries = []
for mobj in re.finditer(
r'<a[^>]+href=(["\'])(?P<url>%s.*?)\1[^>]*>' % AsianCrushIE._VALID_URL,
webpage):
attrs = extract_attributes(mobj.group(0))
if attrs.get('class') == 'clearfix':
entries.append(self.url_result(
mobj.group('url'), ie=AsianCrushIE.ie_key()))
title = self._html_search_regex(
r'(?s)<h1\b[^>]\bid=["\']movieTitle[^>]+>(.+?)</h1>', webpage,
'title', default=None) or self._og_search_title(
webpage, default=None) or self._html_search_meta(
'twitter:title', webpage, 'title',
default=None) or self._search_regex(
r'<title>([^<]+)</title>', webpage, 'title', fatal=False)
if title:
title = re.sub(r'\s*\|\s*.+?$', '', title)
description = self._og_search_description(
webpage, default=None) or self._html_search_meta(
'twitter:description', webpage, 'description', fatal=False)
return self.playlist_result(entries, playlist_id, title, description)

View File

@@ -0,0 +1,118 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..compat import compat_HTTPError
from ..utils import (
ExtractorError,
int_or_none,
urlencode_postdata,
)
class AtresPlayerIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?atresplayer\.com/[^/]+/[^/]+/[^/]+/[^/]+/(?P<display_id>.+?)_(?P<id>[0-9a-f]{24})'
_NETRC_MACHINE = 'atresplayer'
_TESTS = [
{
'url': 'https://www.atresplayer.com/antena3/series/pequenas-coincidencias/temporada-1/capitulo-7-asuntos-pendientes_5d4aa2c57ed1a88fc715a615/',
'info_dict': {
'id': '5d4aa2c57ed1a88fc715a615',
'ext': 'mp4',
'title': 'Capítulo 7: Asuntos pendientes',
'description': 'md5:7634cdcb4d50d5381bedf93efb537fbc',
'duration': 3413,
},
'params': {
'format': 'bestvideo',
},
'skip': 'This video is only available for registered users'
},
{
'url': 'https://www.atresplayer.com/lasexta/programas/el-club-de-la-comedia/temporada-4/capitulo-10-especial-solidario-nochebuena_5ad08edf986b2855ed47adc4/',
'only_matching': True,
},
{
'url': 'https://www.atresplayer.com/antena3/series/el-secreto-de-puente-viejo/el-chico-de-los-tres-lunares/capitulo-977-29-12-14_5ad51046986b2886722ccdea/',
'only_matching': True,
},
]
_API_BASE = 'https://api.atresplayer.com/'
def _real_initialize(self):
self._login()
def _handle_error(self, e, code):
if isinstance(e.cause, compat_HTTPError) and e.cause.code == code:
error = self._parse_json(e.cause.read(), None)
if error.get('error') == 'required_registered':
self.raise_login_required()
raise ExtractorError(error['error_description'], expected=True)
raise
def _login(self):
username, password = self._get_login_info()
if username is None:
return
self._request_webpage(
self._API_BASE + 'login', None, 'Downloading login page')
try:
target_url = self._download_json(
'https://account.atresmedia.com/api/login', None,
'Logging in', headers={
'Content-Type': 'application/x-www-form-urlencoded'
}, data=urlencode_postdata({
'username': username,
'password': password,
}))['targetUrl']
except ExtractorError as e:
self._handle_error(e, 400)
self._request_webpage(target_url, None, 'Following Target URL')
def _real_extract(self, url):
display_id, video_id = re.match(self._VALID_URL, url).groups()
try:
episode = self._download_json(
self._API_BASE + 'client/v1/player/episode/' + video_id, video_id)
except ExtractorError as e:
self._handle_error(e, 403)
title = episode['titulo']
formats = []
for source in episode.get('sources', []):
src = source.get('src')
if not src:
continue
src_type = source.get('type')
if src_type == 'application/vnd.apple.mpegurl':
formats.extend(self._extract_m3u8_formats(
src, video_id, 'mp4', 'm3u8_native',
m3u8_id='hls', fatal=False))
elif src_type == 'application/dash+xml':
formats.extend(self._extract_mpd_formats(
src, video_id, mpd_id='dash', fatal=False))
self._sort_formats(formats)
heartbeat = episode.get('heartbeat') or {}
omniture = episode.get('omniture') or {}
get_meta = lambda x: heartbeat.get(x) or omniture.get(x)
return {
'display_id': display_id,
'id': video_id,
'title': title,
'description': episode.get('descripcion'),
'thumbnail': episode.get('imgPoster'),
'duration': int_or_none(episode.get('duration')),
'formats': formats,
'channel': get_meta('channel'),
'season': get_meta('season'),
'episode_number': int_or_none(get_meta('episodeNumber')),
}

View File

@@ -0,0 +1,55 @@
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import unified_strdate
class ATTTechChannelIE(InfoExtractor):
_VALID_URL = r'https?://techchannel\.att\.com/play-video\.cfm/([^/]+/)*(?P<id>.+)'
_TEST = {
'url': 'http://techchannel.att.com/play-video.cfm/2014/1/27/ATT-Archives-The-UNIX-System-Making-Computers-Easier-to-Use',
'info_dict': {
'id': '11316',
'display_id': 'ATT-Archives-The-UNIX-System-Making-Computers-Easier-to-Use',
'ext': 'flv',
'title': 'AT&T Archives : The UNIX System: Making Computers Easier to Use',
'description': 'A 1982 film about UNIX is the foundation for software in use around Bell Labs and AT&T.',
'thumbnail': r're:^https?://.*\.jpg$',
'upload_date': '20140127',
},
'params': {
# rtmp download
'skip_download': True,
},
}
def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
video_url = self._search_regex(
r"url\s*:\s*'(rtmp://[^']+)'",
webpage, 'video URL')
video_id = self._search_regex(
r'mediaid\s*=\s*(\d+)',
webpage, 'video id', fatal=False)
title = self._og_search_title(webpage)
description = self._og_search_description(webpage)
thumbnail = self._og_search_thumbnail(webpage)
upload_date = unified_strdate(self._search_regex(
r'[Rr]elease\s+date:\s*(\d{1,2}/\d{1,2}/\d{4})',
webpage, 'upload date', fatal=False), False)
return {
'id': video_id,
'display_id': display_id,
'url': video_url,
'ext': 'flv',
'title': title,
'description': description,
'thumbnail': thumbnail,
'upload_date': upload_date,
}

View File

@@ -0,0 +1,75 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
determine_ext,
int_or_none,
unescapeHTML,
)
class ATVAtIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?atv\.at/(?:[^/]+/){2}(?P<id>[dv]\d+)'
_TESTS = [{
'url': 'http://atv.at/aktuell/di-210317-2005-uhr/v1698449/',
'md5': 'c3b6b975fb3150fc628572939df205f2',
'info_dict': {
'id': '1698447',
'ext': 'mp4',
'title': 'DI, 21.03.17 | 20:05 Uhr 1/1',
}
}, {
'url': 'http://atv.at/aktuell/meinrad-knapp/d8416/',
'only_matching': True,
}]
def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
video_data = self._parse_json(unescapeHTML(self._search_regex(
[r'flashPlayerOptions\s*=\s*(["\'])(?P<json>(?:(?!\1).)+)\1',
r'class="[^"]*jsb_video/FlashPlayer[^"]*"[^>]+data-jsb="(?P<json>[^"]+)"'],
webpage, 'player data', group='json')),
display_id)['config']['initial_video']
video_id = video_data['id']
video_title = video_data['title']
parts = []
for part in video_data.get('parts', []):
part_id = part['id']
part_title = part['title']
formats = []
for source in part.get('sources', []):
source_url = source.get('src')
if not source_url:
continue
ext = determine_ext(source_url)
if ext == 'm3u8':
formats.extend(self._extract_m3u8_formats(
source_url, part_id, 'mp4', 'm3u8_native',
m3u8_id='hls', fatal=False))
else:
formats.append({
'format_id': source.get('delivery'),
'url': source_url,
})
self._sort_formats(formats)
parts.append({
'id': part_id,
'title': part_title,
'thumbnail': part.get('preview_image_url'),
'duration': int_or_none(part.get('duration')),
'is_live': part.get('is_livestream'),
'formats': formats,
})
return {
'_type': 'multi_video',
'id': video_id,
'title': video_title,
'entries': parts,
}

View File

@@ -0,0 +1,93 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
int_or_none,
parse_iso8601,
)
class AudiMediaIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?audi-mediacenter\.com/(?:en|de)/audimediatv/(?:video/)?(?P<id>[^/?#]+)'
_TESTS = [{
'url': 'https://www.audi-mediacenter.com/en/audimediatv/60-seconds-of-audi-sport-104-2015-wec-bahrain-rookie-test-1467',
'md5': '79a8b71c46d49042609795ab59779b66',
'info_dict': {
'id': '1565',
'ext': 'mp4',
'title': '60 Seconds of Audi Sport 104/2015 - WEC Bahrain, Rookie Test',
'description': 'md5:60e5d30a78ced725f7b8d34370762941',
'upload_date': '20151124',
'timestamp': 1448354940,
'duration': 74022,
'view_count': int,
}
}, {
'url': 'https://www.audi-mediacenter.com/en/audimediatv/video/60-seconds-of-audi-sport-104-2015-wec-bahrain-rookie-test-2991',
'only_matching': True,
}]
def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
raw_payload = self._search_regex([
r'class="amtv-embed"[^>]+id="([0-9a-z-]+)"',
r'id="([0-9a-z-]+)"[^>]+class="amtv-embed"',
r'class=\\"amtv-embed\\"[^>]+id=\\"([0-9a-z-]+)\\"',
r'id=\\"([0-9a-z-]+)\\"[^>]+class=\\"amtv-embed\\"',
r'id=(?:\\)?"(amtve-[a-z]-\d+-[a-z]{2})',
], webpage, 'raw payload')
_, stage_mode, video_id, _ = raw_payload.split('-')
# TODO: handle s and e stage_mode (live streams and ended live streams)
if stage_mode not in ('s', 'e'):
video_data = self._download_json(
'https://www.audimedia.tv/api/video/v1/videos/' + video_id,
video_id, query={
'embed[]': ['video_versions', 'thumbnail_image'],
})['results']
formats = []
stream_url_hls = video_data.get('stream_url_hls')
if stream_url_hls:
formats.extend(self._extract_m3u8_formats(
stream_url_hls, video_id, 'mp4',
entry_protocol='m3u8_native', m3u8_id='hls', fatal=False))
stream_url_hds = video_data.get('stream_url_hds')
if stream_url_hds:
formats.extend(self._extract_f4m_formats(
stream_url_hds + '?hdcore=3.4.0',
video_id, f4m_id='hds', fatal=False))
for video_version in video_data.get('video_versions', []):
video_version_url = video_version.get('download_url') or video_version.get('stream_url')
if not video_version_url:
continue
f = {
'url': video_version_url,
'width': int_or_none(video_version.get('width')),
'height': int_or_none(video_version.get('height')),
'abr': int_or_none(video_version.get('audio_bitrate')),
'vbr': int_or_none(video_version.get('video_bitrate')),
}
bitrate = self._search_regex(r'(\d+)k', video_version_url, 'bitrate', default=None)
if bitrate:
f.update({
'format_id': 'http-%s' % bitrate,
})
formats.append(f)
self._sort_formats(formats)
return {
'id': video_id,
'title': video_data['title'],
'description': video_data.get('subtitle'),
'thumbnail': video_data.get('thumbnail_image', {}).get('file'),
'timestamp': parse_iso8601(video_data.get('publication_date')),
'duration': int_or_none(video_data.get('duration')),
'view_count': int_or_none(video_data.get('view_count')),
'formats': formats,
}

View File

@@ -0,0 +1,73 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
clean_html,
float_or_none,
)
class AudioBoomIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?audioboom\.com/(?:boos|posts)/(?P<id>[0-9]+)'
_TESTS = [{
'url': 'https://audioboom.com/posts/7398103-asim-chaudhry',
'md5': '7b00192e593ff227e6a315486979a42d',
'info_dict': {
'id': '7398103',
'ext': 'mp3',
'title': 'Asim Chaudhry',
'description': 'md5:2f3fef17dacc2595b5362e1d7d3602fc',
'duration': 4000.99,
'uploader': 'Sue Perkins: An hour or so with...',
'uploader_url': r're:https?://(?:www\.)?audioboom\.com/channel/perkins',
}
}, {
'url': 'https://audioboom.com/posts/4279833-3-09-2016-czaban-hour-3?t=0',
'only_matching': True,
}]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
clip = None
clip_store = self._parse_json(
self._html_search_regex(
r'data-new-clip-store=(["\'])(?P<json>{.+?})\1',
webpage, 'clip store', default='{}', group='json'),
video_id, fatal=False)
if clip_store:
clips = clip_store.get('clips')
if clips and isinstance(clips, list) and isinstance(clips[0], dict):
clip = clips[0]
def from_clip(field):
if clip:
return clip.get(field)
audio_url = from_clip('clipURLPriorToLoading') or self._og_search_property(
'audio', webpage, 'audio url')
title = from_clip('title') or self._html_search_meta(
['og:title', 'og:audio:title', 'audio_title'], webpage)
description = from_clip('description') or clean_html(from_clip('formattedDescription')) or self._og_search_description(webpage)
duration = float_or_none(from_clip('duration') or self._html_search_meta(
'weibo:audio:duration', webpage))
uploader = from_clip('author') or self._html_search_meta(
['og:audio:artist', 'twitter:audio:artist_name', 'audio_artist'], webpage, 'uploader')
uploader_url = from_clip('author_url') or self._html_search_meta(
'audioboo:channel', webpage, 'uploader url')
return {
'id': video_id,
'url': audio_url,
'title': title,
'description': description,
'duration': duration,
'uploader': uploader,
'uploader_url': uploader_url,
}

View File

@@ -0,0 +1,145 @@
# coding: utf-8
from __future__ import unicode_literals
import itertools
import time
from .common import InfoExtractor
from .soundcloud import SoundcloudIE
from ..compat import compat_str
from ..utils import (
ExtractorError,
url_basename,
)
class AudiomackIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?audiomack\.com/song/(?P<id>[\w/-]+)'
IE_NAME = 'audiomack'
_TESTS = [
# hosted on audiomack
{
'url': 'http://www.audiomack.com/song/roosh-williams/extraordinary',
'info_dict':
{
'id': '310086',
'ext': 'mp3',
'uploader': 'Roosh Williams',
'title': 'Extraordinary'
}
},
# audiomack wrapper around soundcloud song
{
'add_ie': ['Soundcloud'],
'url': 'http://www.audiomack.com/song/hip-hop-daily/black-mamba-freestyle',
'info_dict': {
'id': '258901379',
'ext': 'mp3',
'description': 'mamba day freestyle for the legend Kobe Bryant ',
'title': 'Black Mamba Freestyle [Prod. By Danny Wolf]',
'uploader': 'ILOVEMAKONNEN',
'upload_date': '20160414',
}
},
]
def _real_extract(self, url):
# URLs end with [uploader name]/[uploader title]
# this title is whatever the user types in, and is rarely
# the proper song title. Real metadata is in the api response
album_url_tag = self._match_id(url)
# Request the extended version of the api for extra fields like artist and title
api_response = self._download_json(
'http://www.audiomack.com/api/music/url/song/%s?extended=1&_=%d' % (
album_url_tag, time.time()),
album_url_tag)
# API is inconsistent with errors
if 'url' not in api_response or not api_response['url'] or 'error' in api_response:
raise ExtractorError('Invalid url %s' % url)
# Audiomack wraps a lot of soundcloud tracks in their branded wrapper
# if so, pass the work off to the soundcloud extractor
if SoundcloudIE.suitable(api_response['url']):
return self.url_result(api_response['url'], SoundcloudIE.ie_key())
return {
'id': compat_str(api_response.get('id', album_url_tag)),
'uploader': api_response.get('artist'),
'title': api_response.get('title'),
'url': api_response['url'],
}
class AudiomackAlbumIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?audiomack\.com/album/(?P<id>[\w/-]+)'
IE_NAME = 'audiomack:album'
_TESTS = [
# Standard album playlist
{
'url': 'http://www.audiomack.com/album/flytunezcom/tha-tour-part-2-mixtape',
'playlist_count': 15,
'info_dict':
{
'id': '812251',
'title': 'Tha Tour: Part 2 (Official Mixtape)'
}
},
# Album playlist ripped from fakeshoredrive with no metadata
{
'url': 'http://www.audiomack.com/album/fakeshoredrive/ppp-pistol-p-project',
'info_dict': {
'title': 'PPP (Pistol P Project)',
'id': '837572',
},
'playlist': [{
'info_dict': {
'title': 'PPP (Pistol P Project) - 9. Heaven or Hell (CHIMACA) ft Zuse (prod by DJ FU)',
'id': '837577',
'ext': 'mp3',
'uploader': 'Lil Herb a.k.a. G Herbo',
}
}],
'params': {
'playliststart': 9,
'playlistend': 9,
}
}
]
def _real_extract(self, url):
# URLs end with [uploader name]/[uploader title]
# this title is whatever the user types in, and is rarely
# the proper song title. Real metadata is in the api response
album_url_tag = self._match_id(url)
result = {'_type': 'playlist', 'entries': []}
# There is no one endpoint for album metadata - instead it is included/repeated in each song's metadata
# Therefore we don't know how many songs the album has and must infi-loop until failure
for track_no in itertools.count():
# Get song's metadata
api_response = self._download_json(
'http://www.audiomack.com/api/music/url/album/%s/%d?extended=1&_=%d'
% (album_url_tag, track_no, time.time()), album_url_tag,
note='Querying song information (%d)' % (track_no + 1))
# Total failure, only occurs when url is totally wrong
# Won't happen in middle of valid playlist (next case)
if 'url' not in api_response or 'error' in api_response:
raise ExtractorError('Invalid url for track %d of album url %s' % (track_no, url))
# URL is good but song id doesn't exist - usually means end of playlist
elif not api_response['url']:
break
else:
# Pull out the album metadata and add to result (if it exists)
for resultkey, apikey in [('id', 'album_id'), ('title', 'album_title')]:
if apikey in api_response and resultkey not in result:
result[resultkey] = api_response[apikey]
song_id = url_basename(api_response['url']).rpartition('.')[0]
result['entries'].append({
'id': compat_str(api_response.get('id', song_id)),
'uploader': api_response.get('artist'),
'title': api_response.get('title', song_id),
'url': api_response['url'],
})
return result

View File

@@ -0,0 +1,185 @@
# coding: utf-8
from __future__ import unicode_literals
import re
import base64
from .common import InfoExtractor
from ..compat import (
compat_urllib_parse_urlencode,
compat_str,
)
from ..utils import (
int_or_none,
parse_iso8601,
smuggle_url,
unsmuggle_url,
urlencode_postdata,
)
class AWAANIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?(?:awaan|dcndigital)\.ae/(?:#/)?show/(?P<show_id>\d+)/[^/]+(?:/(?P<video_id>\d+)/(?P<season_id>\d+))?'
def _real_extract(self, url):
show_id, video_id, season_id = re.match(self._VALID_URL, url).groups()
if video_id and int(video_id) > 0:
return self.url_result(
'http://awaan.ae/media/%s' % video_id, 'AWAANVideo')
elif season_id and int(season_id) > 0:
return self.url_result(smuggle_url(
'http://awaan.ae/program/season/%s' % season_id,
{'show_id': show_id}), 'AWAANSeason')
else:
return self.url_result(
'http://awaan.ae/program/%s' % show_id, 'AWAANSeason')
class AWAANBaseIE(InfoExtractor):
def _parse_video_data(self, video_data, video_id, is_live):
title = video_data.get('title_en') or video_data['title_ar']
img = video_data.get('img')
return {
'id': video_id,
'title': self._live_title(title) if is_live else title,
'description': video_data.get('description_en') or video_data.get('description_ar'),
'thumbnail': 'http://admin.mangomolo.com/analytics/%s' % img if img else None,
'duration': int_or_none(video_data.get('duration')),
'timestamp': parse_iso8601(video_data.get('create_time'), ' '),
'is_live': is_live,
}
class AWAANVideoIE(AWAANBaseIE):
IE_NAME = 'awaan:video'
_VALID_URL = r'https?://(?:www\.)?(?:awaan|dcndigital)\.ae/(?:#/)?(?:video(?:/[^/]+)?|media|catchup/[^/]+/[^/]+)/(?P<id>\d+)'
_TESTS = [{
'url': 'http://www.dcndigital.ae/#/video/%D8%B1%D8%AD%D9%84%D8%A9-%D8%A7%D9%84%D8%B9%D9%85%D8%B1-%D8%A7%D9%84%D8%AD%D9%84%D9%82%D8%A9-1/17375',
'md5': '5f61c33bfc7794315c671a62d43116aa',
'info_dict':
{
'id': '17375',
'ext': 'mp4',
'title': 'رحلة العمر : الحلقة 1',
'description': 'md5:0156e935d870acb8ef0a66d24070c6d6',
'duration': 2041,
'timestamp': 1227504126,
'upload_date': '20081124',
'uploader_id': '71',
},
}, {
'url': 'http://awaan.ae/video/26723981/%D8%AF%D8%A7%D8%B1-%D8%A7%D9%84%D8%B3%D9%84%D8%A7%D9%85:-%D8%AE%D9%8A%D8%B1-%D8%AF%D9%88%D8%B1-%D8%A7%D9%84%D8%A3%D9%86%D8%B5%D8%A7%D8%B1',
'only_matching': True,
}]
def _real_extract(self, url):
video_id = self._match_id(url)
video_data = self._download_json(
'http://admin.mangomolo.com/analytics/index.php/plus/video?id=%s' % video_id,
video_id, headers={'Origin': 'http://awaan.ae'})
info = self._parse_video_data(video_data, video_id, False)
embed_url = 'http://admin.mangomolo.com/analytics/index.php/customers/embed/video?' + compat_urllib_parse_urlencode({
'id': video_data['id'],
'user_id': video_data['user_id'],
'signature': video_data['signature'],
'countries': 'Q0M=',
'filter': 'DENY',
})
info.update({
'_type': 'url_transparent',
'url': embed_url,
'ie_key': 'MangomoloVideo',
})
return info
class AWAANLiveIE(AWAANBaseIE):
IE_NAME = 'awaan:live'
_VALID_URL = r'https?://(?:www\.)?(?:awaan|dcndigital)\.ae/(?:#/)?live/(?P<id>\d+)'
_TEST = {
'url': 'http://awaan.ae/live/6/dubai-tv',
'info_dict': {
'id': '6',
'ext': 'mp4',
'title': 're:Dubai Al Oula [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
'upload_date': '20150107',
'timestamp': 1420588800,
},
'params': {
# m3u8 download
'skip_download': True,
},
}
def _real_extract(self, url):
channel_id = self._match_id(url)
channel_data = self._download_json(
'http://admin.mangomolo.com/analytics/index.php/plus/getchanneldetails?channel_id=%s' % channel_id,
channel_id, headers={'Origin': 'http://awaan.ae'})
info = self._parse_video_data(channel_data, channel_id, True)
embed_url = 'http://admin.mangomolo.com/analytics/index.php/customers/embed/index?' + compat_urllib_parse_urlencode({
'id': base64.b64encode(channel_data['user_id'].encode()).decode(),
'channelid': base64.b64encode(channel_data['id'].encode()).decode(),
'signature': channel_data['signature'],
'countries': 'Q0M=',
'filter': 'DENY',
})
info.update({
'_type': 'url_transparent',
'url': embed_url,
'ie_key': 'MangomoloLive',
})
return info
class AWAANSeasonIE(InfoExtractor):
IE_NAME = 'awaan:season'
_VALID_URL = r'https?://(?:www\.)?(?:awaan|dcndigital)\.ae/(?:#/)?program/(?:(?P<show_id>\d+)|season/(?P<season_id>\d+))'
_TEST = {
'url': 'http://dcndigital.ae/#/program/205024/%D9%85%D8%AD%D8%A7%D8%B6%D8%B1%D8%A7%D8%AA-%D8%A7%D9%84%D8%B4%D9%8A%D8%AE-%D8%A7%D9%84%D8%B4%D8%B9%D8%B1%D8%A7%D9%88%D9%8A',
'info_dict':
{
'id': '7910',
'title': 'محاضرات الشيخ الشعراوي',
},
'playlist_mincount': 27,
}
def _real_extract(self, url):
url, smuggled_data = unsmuggle_url(url, {})
show_id, season_id = re.match(self._VALID_URL, url).groups()
data = {}
if season_id:
data['season'] = season_id
show_id = smuggled_data.get('show_id')
if show_id is None:
season = self._download_json(
'http://admin.mangomolo.com/analytics/index.php/plus/season_info?id=%s' % season_id,
season_id, headers={'Origin': 'http://awaan.ae'})
show_id = season['id']
data['show_id'] = show_id
show = self._download_json(
'http://admin.mangomolo.com/analytics/index.php/plus/show',
show_id, data=urlencode_postdata(data), headers={
'Origin': 'http://awaan.ae',
'Content-Type': 'application/x-www-form-urlencoded'
})
if not season_id:
season_id = show['default_season']
for season in show['seasons']:
if season['id'] == season_id:
title = season.get('title_en') or season['title_ar']
entries = []
for video in show['videos']:
video_id = compat_str(video['id'])
entries.append(self.url_result(
'http://awaan.ae/media/%s' % video_id, 'AWAANVideo', video_id))
return self.playlist_result(entries, season_id, title)

View File

@@ -0,0 +1,78 @@
# coding: utf-8
from __future__ import unicode_literals
import datetime
import hashlib
import hmac
from .common import InfoExtractor
from ..compat import compat_urllib_parse_urlencode
class AWSIE(InfoExtractor):
_AWS_ALGORITHM = 'AWS4-HMAC-SHA256'
_AWS_REGION = 'us-east-1'
def _aws_execute_api(self, aws_dict, video_id, query=None):
query = query or {}
amz_date = datetime.datetime.utcnow().strftime('%Y%m%dT%H%M%SZ')
date = amz_date[:8]
headers = {
'Accept': 'application/json',
'Host': self._AWS_PROXY_HOST,
'X-Amz-Date': amz_date,
'X-Api-Key': self._AWS_API_KEY
}
session_token = aws_dict.get('session_token')
if session_token:
headers['X-Amz-Security-Token'] = session_token
def aws_hash(s):
return hashlib.sha256(s.encode('utf-8')).hexdigest()
# Task 1: http://docs.aws.amazon.com/general/latest/gr/sigv4-create-canonical-request.html
canonical_querystring = compat_urllib_parse_urlencode(query)
canonical_headers = ''
for header_name, header_value in sorted(headers.items()):
canonical_headers += '%s:%s\n' % (header_name.lower(), header_value)
signed_headers = ';'.join([header.lower() for header in sorted(headers.keys())])
canonical_request = '\n'.join([
'GET',
aws_dict['uri'],
canonical_querystring,
canonical_headers,
signed_headers,
aws_hash('')
])
# Task 2: http://docs.aws.amazon.com/general/latest/gr/sigv4-create-string-to-sign.html
credential_scope_list = [date, self._AWS_REGION, 'execute-api', 'aws4_request']
credential_scope = '/'.join(credential_scope_list)
string_to_sign = '\n'.join([self._AWS_ALGORITHM, amz_date, credential_scope, aws_hash(canonical_request)])
# Task 3: http://docs.aws.amazon.com/general/latest/gr/sigv4-calculate-signature.html
def aws_hmac(key, msg):
return hmac.new(key, msg.encode('utf-8'), hashlib.sha256)
def aws_hmac_digest(key, msg):
return aws_hmac(key, msg).digest()
def aws_hmac_hexdigest(key, msg):
return aws_hmac(key, msg).hexdigest()
k_signing = ('AWS4' + aws_dict['secret_key']).encode('utf-8')
for value in credential_scope_list:
k_signing = aws_hmac_digest(k_signing, value)
signature = aws_hmac_hexdigest(k_signing, string_to_sign)
# Task 4: http://docs.aws.amazon.com/general/latest/gr/sigv4-add-signature-to-request.html
headers['Authorization'] = ', '.join([
'%s Credential=%s/%s' % (self._AWS_ALGORITHM, aws_dict['access_key'], credential_scope),
'SignedHeaders=%s' % signed_headers,
'Signature=%s' % signature,
])
return self._download_json(
'https://%s%s%s' % (self._AWS_PROXY_HOST, aws_dict['uri'], '?' + canonical_querystring if canonical_querystring else ''),
video_id, headers=headers)

View File

@@ -0,0 +1,66 @@
# coding: utf-8
from __future__ import unicode_literals
import json
import re
from .common import InfoExtractor
from .kaltura import KalturaIE
class AZMedienIE(InfoExtractor):
IE_DESC = 'AZ Medien videos'
_VALID_URL = r'''(?x)
https?://
(?:www\.)?
(?P<host>
telezueri\.ch|
telebaern\.tv|
telem1\.ch
)/
[^/]+/
(?P<id>
[^/]+-(?P<article_id>\d+)
)
(?:
\#video=
(?P<kaltura_id>
[_0-9a-z]+
)
)?
'''
_TESTS = [{
'url': 'https://www.telezueri.ch/sonntalk/bundesrats-vakanzen-eu-rahmenabkommen-133214569',
'info_dict': {
'id': '1_anruz3wy',
'ext': 'mp4',
'title': 'Bundesrats-Vakanzen / EU-Rahmenabkommen',
'uploader_id': 'TVOnline',
'upload_date': '20180930',
'timestamp': 1538328802,
},
'params': {
'skip_download': True,
},
}, {
'url': 'https://www.telebaern.tv/telebaern-news/montag-1-oktober-2018-ganze-sendung-133531189#video=0_7xjo9lf1',
'only_matching': True
}]
_API_TEMPL = 'https://www.%s/api/pub/gql/%s/NewsArticleTeaser/cb9f2f81ed22e9b47f4ca64ea3cc5a5d13e88d1d'
_PARTNER_ID = '1719221'
def _real_extract(self, url):
host, display_id, article_id, entry_id = re.match(self._VALID_URL, url).groups()
if not entry_id:
entry_id = self._download_json(
self._API_TEMPL % (host, host.split('.')[0]), display_id, query={
'variables': json.dumps({
'contextId': 'NewsArticle:' + article_id,
}),
})['data']['context']['mainAsset']['video']['kaltura']['kalturaId']
return self.url_result(
'kaltura:%s:%s' % (self._PARTNER_ID, entry_id),
ie=KalturaIE.ie_key(), video_id=entry_id)

View File

@@ -0,0 +1,56 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import unescapeHTML
class BaiduVideoIE(InfoExtractor):
IE_DESC = '百度视频'
_VALID_URL = r'https?://v\.baidu\.com/(?P<type>[a-z]+)/(?P<id>\d+)\.htm'
_TESTS = [{
'url': 'http://v.baidu.com/comic/1069.htm?frp=bdbrand&q=%E4%B8%AD%E5%8D%8E%E5%B0%8F%E5%BD%93%E5%AE%B6',
'info_dict': {
'id': '1069',
'title': '中华小当家 TV版国语',
'description': 'md5:51be07afe461cf99fa61231421b5397c',
},
'playlist_count': 52,
}, {
'url': 'http://v.baidu.com/show/11595.htm?frp=bdbrand',
'info_dict': {
'id': '11595',
'title': 're:^奔跑吧兄弟',
'description': 'md5:1bf88bad6d850930f542d51547c089b8',
},
'playlist_mincount': 12,
}]
def _call_api(self, path, category, playlist_id, note):
return self._download_json('http://app.video.baidu.com/%s/?worktype=adnative%s&id=%s' % (
path, category, playlist_id), playlist_id, note)
def _real_extract(self, url):
category, playlist_id = re.match(self._VALID_URL, url).groups()
if category == 'show':
category = 'tvshow'
if category == 'tv':
category = 'tvplay'
playlist_detail = self._call_api(
'xqinfo', category, playlist_id, 'Download playlist JSON metadata')
playlist_title = playlist_detail['title']
playlist_description = unescapeHTML(playlist_detail.get('intro'))
episodes_detail = self._call_api(
'xqsingle', category, playlist_id, 'Download episodes JSON metadata')
entries = [self.url_result(
episode['url'], video_title=episode['title']
) for episode in episodes_detail['videos']]
return self.playlist_result(
entries, playlist_id, playlist_title, playlist_description)

View File

@@ -0,0 +1,417 @@
from __future__ import unicode_literals
import random
import re
import time
from .common import InfoExtractor
from ..compat import (
compat_str,
compat_urlparse,
)
from ..utils import (
ExtractorError,
float_or_none,
int_or_none,
KNOWN_EXTENSIONS,
parse_filesize,
str_or_none,
try_get,
unescapeHTML,
update_url_query,
unified_strdate,
unified_timestamp,
url_or_none,
)
class BandcampIE(InfoExtractor):
_VALID_URL = r'https?://[^/]+\.bandcamp\.com/track/(?P<title>[^/?#&]+)'
_TESTS = [{
'url': 'http://youtube-dlc.bandcamp.com/track/youtube-dlc-test-song',
'md5': 'c557841d5e50261777a6585648adf439',
'info_dict': {
'id': '1812978515',
'ext': 'mp3',
'title': "youtube-dlc \"'/\\\u00e4\u21ad - youtube-dlc test song \"'/\\\u00e4\u21ad",
'duration': 9.8485,
},
'_skip': 'There is a limit of 200 free downloads / month for the test song'
}, {
# free download
'url': 'http://benprunty.bandcamp.com/track/lanius-battle',
'md5': '853e35bf34aa1d6fe2615ae612564b36',
'info_dict': {
'id': '2650410135',
'ext': 'aiff',
'title': 'Ben Prunty - Lanius (Battle)',
'thumbnail': r're:^https?://.*\.jpg$',
'uploader': 'Ben Prunty',
'timestamp': 1396508491,
'upload_date': '20140403',
'release_date': '20140403',
'duration': 260.877,
'track': 'Lanius (Battle)',
'track_number': 1,
'track_id': '2650410135',
'artist': 'Ben Prunty',
'album': 'FTL: Advanced Edition Soundtrack',
},
}, {
# no free download, mp3 128
'url': 'https://relapsealumni.bandcamp.com/track/hail-to-fire',
'md5': 'fec12ff55e804bb7f7ebeb77a800c8b7',
'info_dict': {
'id': '2584466013',
'ext': 'mp3',
'title': 'Mastodon - Hail to Fire',
'thumbnail': r're:^https?://.*\.jpg$',
'uploader': 'Mastodon',
'timestamp': 1322005399,
'upload_date': '20111122',
'release_date': '20040207',
'duration': 120.79,
'track': 'Hail to Fire',
'track_number': 5,
'track_id': '2584466013',
'artist': 'Mastodon',
'album': 'Call of the Mastodon',
},
}]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
title = mobj.group('title')
webpage = self._download_webpage(url, title)
thumbnail = self._html_search_meta('og:image', webpage, default=None)
track_id = None
track = None
track_number = None
duration = None
formats = []
track_info = self._parse_json(
self._search_regex(
r'trackinfo\s*:\s*\[\s*({.+?})\s*\]\s*,\s*?\n',
webpage, 'track info', default='{}'), title)
if track_info:
file_ = track_info.get('file')
if isinstance(file_, dict):
for format_id, format_url in file_.items():
if not url_or_none(format_url):
continue
ext, abr_str = format_id.split('-', 1)
formats.append({
'format_id': format_id,
'url': self._proto_relative_url(format_url, 'http:'),
'ext': ext,
'vcodec': 'none',
'acodec': ext,
'abr': int_or_none(abr_str),
})
track = track_info.get('title')
track_id = str_or_none(track_info.get('track_id') or track_info.get('id'))
track_number = int_or_none(track_info.get('track_num'))
duration = float_or_none(track_info.get('duration'))
def extract(key):
return self._search_regex(
r'\b%s\s*["\']?\s*:\s*(["\'])(?P<value>(?:(?!\1).)+)\1' % key,
webpage, key, default=None, group='value')
artist = extract('artist')
album = extract('album_title')
timestamp = unified_timestamp(
extract('publish_date') or extract('album_publish_date'))
release_date = unified_strdate(extract('album_release_date'))
download_link = self._search_regex(
r'freeDownloadPage\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage,
'download link', default=None, group='url')
if download_link:
track_id = self._search_regex(
r'(?ms)var TralbumData = .*?[{,]\s*id: (?P<id>\d+),?$',
webpage, 'track id')
download_webpage = self._download_webpage(
download_link, track_id, 'Downloading free downloads page')
blob = self._parse_json(
self._search_regex(
r'data-blob=(["\'])(?P<blob>{.+?})\1', download_webpage,
'blob', group='blob'),
track_id, transform_source=unescapeHTML)
info = try_get(
blob, (lambda x: x['digital_items'][0],
lambda x: x['download_items'][0]), dict)
if info:
downloads = info.get('downloads')
if isinstance(downloads, dict):
if not track:
track = info.get('title')
if not artist:
artist = info.get('artist')
if not thumbnail:
thumbnail = info.get('thumb_url')
download_formats = {}
download_formats_list = blob.get('download_formats')
if isinstance(download_formats_list, list):
for f in blob['download_formats']:
name, ext = f.get('name'), f.get('file_extension')
if all(isinstance(x, compat_str) for x in (name, ext)):
download_formats[name] = ext.strip('.')
for format_id, f in downloads.items():
format_url = f.get('url')
if not format_url:
continue
# Stat URL generation algorithm is reverse engineered from
# download_*_bundle_*.js
stat_url = update_url_query(
format_url.replace('/download/', '/statdownload/'), {
'.rand': int(time.time() * 1000 * random.random()),
})
format_id = f.get('encoding_name') or format_id
stat = self._download_json(
stat_url, track_id, 'Downloading %s JSON' % format_id,
transform_source=lambda s: s[s.index('{'):s.rindex('}') + 1],
fatal=False)
if not stat:
continue
retry_url = url_or_none(stat.get('retry_url'))
if not retry_url:
continue
formats.append({
'url': self._proto_relative_url(retry_url, 'http:'),
'ext': download_formats.get(format_id),
'format_id': format_id,
'format_note': f.get('description'),
'filesize': parse_filesize(f.get('size_mb')),
'vcodec': 'none',
})
self._sort_formats(formats)
title = '%s - %s' % (artist, track) if artist else track
if not duration:
duration = float_or_none(self._html_search_meta(
'duration', webpage, default=None))
return {
'id': track_id,
'title': title,
'thumbnail': thumbnail,
'uploader': artist,
'timestamp': timestamp,
'release_date': release_date,
'duration': duration,
'track': track,
'track_number': track_number,
'track_id': track_id,
'artist': artist,
'album': album,
'formats': formats,
}
class BandcampAlbumIE(InfoExtractor):
IE_NAME = 'Bandcamp:album'
_VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?:/album/(?P<album_id>[^/?#&]+))?'
_TESTS = [{
'url': 'http://blazo.bandcamp.com/album/jazz-format-mixtape-vol-1',
'playlist': [
{
'md5': '39bc1eded3476e927c724321ddf116cf',
'info_dict': {
'id': '1353101989',
'ext': 'mp3',
'title': 'Intro',
}
},
{
'md5': '1a2c32e2691474643e912cc6cd4bffaa',
'info_dict': {
'id': '38097443',
'ext': 'mp3',
'title': 'Kero One - Keep It Alive (Blazo remix)',
}
},
],
'info_dict': {
'title': 'Jazz Format Mixtape vol.1',
'id': 'jazz-format-mixtape-vol-1',
'uploader_id': 'blazo',
},
'params': {
'playlistend': 2
},
'skip': 'Bandcamp imposes download limits.'
}, {
'url': 'http://nightbringer.bandcamp.com/album/hierophany-of-the-open-grave',
'info_dict': {
'title': 'Hierophany of the Open Grave',
'uploader_id': 'nightbringer',
'id': 'hierophany-of-the-open-grave',
},
'playlist_mincount': 9,
}, {
'url': 'http://dotscale.bandcamp.com',
'info_dict': {
'title': 'Loom',
'id': 'dotscale',
'uploader_id': 'dotscale',
},
'playlist_mincount': 7,
}, {
# with escaped quote in title
'url': 'https://jstrecords.bandcamp.com/album/entropy-ep',
'info_dict': {
'title': '"Entropy" EP',
'uploader_id': 'jstrecords',
'id': 'entropy-ep',
},
'playlist_mincount': 3,
}, {
# not all tracks have songs
'url': 'https://insulters.bandcamp.com/album/we-are-the-plague',
'info_dict': {
'id': 'we-are-the-plague',
'title': 'WE ARE THE PLAGUE',
'uploader_id': 'insulters',
},
'playlist_count': 2,
}]
@classmethod
def suitable(cls, url):
return (False
if BandcampWeeklyIE.suitable(url) or BandcampIE.suitable(url)
else super(BandcampAlbumIE, cls).suitable(url))
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
uploader_id = mobj.group('subdomain')
album_id = mobj.group('album_id')
playlist_id = album_id or uploader_id
webpage = self._download_webpage(url, playlist_id)
track_elements = re.findall(
r'(?s)<div[^>]*>(.*?<a[^>]+href="([^"]+?)"[^>]+itemprop="url"[^>]*>.*?)</div>', webpage)
if not track_elements:
raise ExtractorError('The page doesn\'t contain any tracks')
# Only tracks with duration info have songs
entries = [
self.url_result(
compat_urlparse.urljoin(url, t_path),
ie=BandcampIE.ie_key(),
video_title=self._search_regex(
r'<span\b[^>]+\bitemprop=["\']name["\'][^>]*>([^<]+)',
elem_content, 'track title', fatal=False))
for elem_content, t_path in track_elements
if self._html_search_meta('duration', elem_content, default=None)]
title = self._html_search_regex(
r'album_title\s*:\s*"((?:\\.|[^"\\])+?)"',
webpage, 'title', fatal=False)
if title:
title = title.replace(r'\"', '"')
return {
'_type': 'playlist',
'uploader_id': uploader_id,
'id': playlist_id,
'title': title,
'entries': entries,
}
class BandcampWeeklyIE(InfoExtractor):
IE_NAME = 'Bandcamp:weekly'
_VALID_URL = r'https?://(?:www\.)?bandcamp\.com/?\?(?:.*?&)?show=(?P<id>\d+)'
_TESTS = [{
'url': 'https://bandcamp.com/?show=224',
'md5': 'b00df799c733cf7e0c567ed187dea0fd',
'info_dict': {
'id': '224',
'ext': 'opus',
'title': 'BC Weekly April 4th 2017 - Magic Moments',
'description': 'md5:5d48150916e8e02d030623a48512c874',
'duration': 5829.77,
'release_date': '20170404',
'series': 'Bandcamp Weekly',
'episode': 'Magic Moments',
'episode_number': 208,
'episode_id': '224',
}
}, {
'url': 'https://bandcamp.com/?blah/blah@&show=228',
'only_matching': True
}]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
blob = self._parse_json(
self._search_regex(
r'data-blob=(["\'])(?P<blob>{.+?})\1', webpage,
'blob', group='blob'),
video_id, transform_source=unescapeHTML)
show = blob['bcw_show']
# This is desired because any invalid show id redirects to `bandcamp.com`
# which happens to expose the latest Bandcamp Weekly episode.
show_id = int_or_none(show.get('show_id')) or int_or_none(video_id)
formats = []
for format_id, format_url in show['audio_stream'].items():
if not url_or_none(format_url):
continue
for known_ext in KNOWN_EXTENSIONS:
if known_ext in format_id:
ext = known_ext
break
else:
ext = None
formats.append({
'format_id': format_id,
'url': format_url,
'ext': ext,
'vcodec': 'none',
})
self._sort_formats(formats)
title = show.get('audio_title') or 'Bandcamp Weekly'
subtitle = show.get('subtitle')
if subtitle:
title += ' - %s' % subtitle
episode_number = None
seq = blob.get('bcw_seq')
if seq and isinstance(seq, list):
try:
episode_number = next(
int_or_none(e.get('episode_number'))
for e in seq
if isinstance(e, dict) and int_or_none(e.get('id')) == show_id)
except StopIteration:
pass
return {
'id': video_id,
'title': title,
'description': show.get('desc') or show.get('short_desc'),
'duration': float_or_none(show.get('audio_duration')),
'is_live': False,
'release_date': unified_strdate(show.get('published_date')),
'series': 'Bandcamp Weekly',
'episode': show.get('subtitle'),
'episode_number': episode_number,
'episode_id': compat_str(video_id),
'formats': formats
}

1359
youtube_dlc/extractor/bbc.py Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,194 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
ExtractorError,
clean_html,
compat_str,
float_or_none,
int_or_none,
parse_iso8601,
try_get,
urljoin,
)
class BeamProBaseIE(InfoExtractor):
_API_BASE = 'https://mixer.com/api/v1'
_RATINGS = {'family': 0, 'teen': 13, '18+': 18}
def _extract_channel_info(self, chan):
user_id = chan.get('userId') or try_get(chan, lambda x: x['user']['id'])
return {
'uploader': chan.get('token') or try_get(
chan, lambda x: x['user']['username'], compat_str),
'uploader_id': compat_str(user_id) if user_id else None,
'age_limit': self._RATINGS.get(chan.get('audience')),
}
class BeamProLiveIE(BeamProBaseIE):
IE_NAME = 'Mixer:live'
_VALID_URL = r'https?://(?:\w+\.)?(?:beam\.pro|mixer\.com)/(?P<id>[^/?#&]+)'
_TEST = {
'url': 'http://mixer.com/niterhayven',
'info_dict': {
'id': '261562',
'ext': 'mp4',
'title': 'Introducing The Witcher 3 // The Grind Starts Now!',
'description': 'md5:0b161ac080f15fe05d18a07adb44a74d',
'thumbnail': r're:https://.*\.jpg$',
'timestamp': 1483477281,
'upload_date': '20170103',
'uploader': 'niterhayven',
'uploader_id': '373396',
'age_limit': 18,
'is_live': True,
'view_count': int,
},
'skip': 'niterhayven is offline',
'params': {
'skip_download': True,
},
}
_MANIFEST_URL_TEMPLATE = '%s/channels/%%s/manifest.%%s' % BeamProBaseIE._API_BASE
@classmethod
def suitable(cls, url):
return False if BeamProVodIE.suitable(url) else super(BeamProLiveIE, cls).suitable(url)
def _real_extract(self, url):
channel_name = self._match_id(url)
chan = self._download_json(
'%s/channels/%s' % (self._API_BASE, channel_name), channel_name)
if chan.get('online') is False:
raise ExtractorError(
'{0} is offline'.format(channel_name), expected=True)
channel_id = chan['id']
def manifest_url(kind):
return self._MANIFEST_URL_TEMPLATE % (channel_id, kind)
formats = self._extract_m3u8_formats(
manifest_url('m3u8'), channel_name, ext='mp4', m3u8_id='hls',
fatal=False)
formats.extend(self._extract_smil_formats(
manifest_url('smil'), channel_name, fatal=False))
self._sort_formats(formats)
info = {
'id': compat_str(chan.get('id') or channel_name),
'title': self._live_title(chan.get('name') or channel_name),
'description': clean_html(chan.get('description')),
'thumbnail': try_get(
chan, lambda x: x['thumbnail']['url'], compat_str),
'timestamp': parse_iso8601(chan.get('updatedAt')),
'is_live': True,
'view_count': int_or_none(chan.get('viewersTotal')),
'formats': formats,
}
info.update(self._extract_channel_info(chan))
return info
class BeamProVodIE(BeamProBaseIE):
IE_NAME = 'Mixer:vod'
_VALID_URL = r'https?://(?:\w+\.)?(?:beam\.pro|mixer\.com)/[^/?#&]+\?.*?\bvod=(?P<id>[^?#&]+)'
_TESTS = [{
'url': 'https://mixer.com/willow8714?vod=2259830',
'md5': 'b2431e6e8347dc92ebafb565d368b76b',
'info_dict': {
'id': '2259830',
'ext': 'mp4',
'title': 'willow8714\'s Channel',
'duration': 6828.15,
'thumbnail': r're:https://.*source\.png$',
'timestamp': 1494046474,
'upload_date': '20170506',
'uploader': 'willow8714',
'uploader_id': '6085379',
'age_limit': 13,
'view_count': int,
},
'params': {
'skip_download': True,
},
}, {
'url': 'https://mixer.com/streamer?vod=IxFno1rqC0S_XJ1a2yGgNw',
'only_matching': True,
}, {
'url': 'https://mixer.com/streamer?vod=Rh3LY0VAqkGpEQUe2pN-ig',
'only_matching': True,
}]
@staticmethod
def _extract_format(vod, vod_type):
if not vod.get('baseUrl'):
return []
if vod_type == 'hls':
filename, protocol = 'manifest.m3u8', 'm3u8_native'
elif vod_type == 'raw':
filename, protocol = 'source.mp4', 'https'
else:
assert False
data = vod.get('data') if isinstance(vod.get('data'), dict) else {}
format_id = [vod_type]
if isinstance(data.get('Height'), compat_str):
format_id.append('%sp' % data['Height'])
return [{
'url': urljoin(vod['baseUrl'], filename),
'format_id': '-'.join(format_id),
'ext': 'mp4',
'protocol': protocol,
'width': int_or_none(data.get('Width')),
'height': int_or_none(data.get('Height')),
'fps': int_or_none(data.get('Fps')),
'tbr': int_or_none(data.get('Bitrate'), 1000),
}]
def _real_extract(self, url):
vod_id = self._match_id(url)
vod_info = self._download_json(
'%s/recordings/%s' % (self._API_BASE, vod_id), vod_id)
state = vod_info.get('state')
if state != 'AVAILABLE':
raise ExtractorError(
'VOD %s is not available (state: %s)' % (vod_id, state),
expected=True)
formats = []
thumbnail_url = None
for vod in vod_info['vods']:
vod_type = vod.get('format')
if vod_type in ('hls', 'raw'):
formats.extend(self._extract_format(vod, vod_type))
elif vod_type == 'thumbnail':
thumbnail_url = urljoin(vod.get('baseUrl'), 'source.png')
self._sort_formats(formats)
info = {
'id': vod_id,
'title': vod_info.get('name') or vod_id,
'duration': float_or_none(vod_info.get('duration')),
'thumbnail': thumbnail_url,
'timestamp': parse_iso8601(vod_info.get('createdAt')),
'view_count': int_or_none(vod_info.get('viewsTotal')),
'formats': formats,
}
info.update(self._extract_channel_info(vod_info.get('channel') or {}))
return info

View File

@@ -0,0 +1,103 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import int_or_none
class BeatportIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.|pro\.)?beatport\.com/track/(?P<display_id>[^/]+)/(?P<id>[0-9]+)'
_TESTS = [{
'url': 'https://beatport.com/track/synesthesia-original-mix/5379371',
'md5': 'b3c34d8639a2f6a7f734382358478887',
'info_dict': {
'id': '5379371',
'display_id': 'synesthesia-original-mix',
'ext': 'mp4',
'title': 'Froxic - Synesthesia (Original Mix)',
},
}, {
'url': 'https://beatport.com/track/love-and-war-original-mix/3756896',
'md5': 'e44c3025dfa38c6577fbaeb43da43514',
'info_dict': {
'id': '3756896',
'display_id': 'love-and-war-original-mix',
'ext': 'mp3',
'title': 'Wolfgang Gartner - Love & War (Original Mix)',
},
}, {
'url': 'https://beatport.com/track/birds-original-mix/4991738',
'md5': 'a1fd8e8046de3950fd039304c186c05f',
'info_dict': {
'id': '4991738',
'display_id': 'birds-original-mix',
'ext': 'mp4',
'title': "Tos, Middle Milk, Mumblin' Johnsson - Birds (Original Mix)",
}
}]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
track_id = mobj.group('id')
display_id = mobj.group('display_id')
webpage = self._download_webpage(url, display_id)
playables = self._parse_json(
self._search_regex(
r'window\.Playables\s*=\s*({.+?});', webpage,
'playables info', flags=re.DOTALL),
track_id)
track = next(t for t in playables['tracks'] if t['id'] == int(track_id))
title = ', '.join((a['name'] for a in track['artists'])) + ' - ' + track['name']
if track['mix']:
title += ' (' + track['mix'] + ')'
formats = []
for ext, info in track['preview'].items():
if not info['url']:
continue
fmt = {
'url': info['url'],
'ext': ext,
'format_id': ext,
'vcodec': 'none',
}
if ext == 'mp3':
fmt['preference'] = 0
fmt['acodec'] = 'mp3'
fmt['abr'] = 96
fmt['asr'] = 44100
elif ext == 'mp4':
fmt['preference'] = 1
fmt['acodec'] = 'aac'
fmt['abr'] = 96
fmt['asr'] = 44100
formats.append(fmt)
self._sort_formats(formats)
images = []
for name, info in track['images'].items():
image_url = info.get('url')
if name == 'dynamic' or not image_url:
continue
image = {
'id': name,
'url': image_url,
'height': int_or_none(info.get('height')),
'width': int_or_none(info.get('width')),
}
images.append(image)
return {
'id': compat_str(track.get('id')) or track_id,
'display_id': track.get('slug') or display_id,
'title': title,
'formats': formats,
'thumbnails': images,
}

View File

@@ -0,0 +1,116 @@
from __future__ import unicode_literals
from .common import InfoExtractor
from ..compat import (
compat_str,
compat_urlparse,
)
from ..utils import (
int_or_none,
unified_timestamp,
)
class BeegIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?beeg\.(?:com|porn(?:/video)?)/(?P<id>\d+)'
_TESTS = [{
# api/v6 v1
'url': 'http://beeg.com/5416503',
'md5': 'a1a1b1a8bc70a89e49ccfd113aed0820',
'info_dict': {
'id': '5416503',
'ext': 'mp4',
'title': 'Sultry Striptease',
'description': 'md5:d22219c09da287c14bed3d6c37ce4bc2',
'timestamp': 1391813355,
'upload_date': '20140207',
'duration': 383,
'tags': list,
'age_limit': 18,
}
}, {
# api/v6 v2
'url': 'https://beeg.com/1941093077?t=911-1391',
'only_matching': True,
}, {
# api/v6 v2 w/o t
'url': 'https://beeg.com/1277207756',
'only_matching': True,
}, {
'url': 'https://beeg.porn/video/5416503',
'only_matching': True,
}, {
'url': 'https://beeg.porn/5416503',
'only_matching': True,
}]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
beeg_version = self._search_regex(
r'beeg_version\s*=\s*([\da-zA-Z_-]+)', webpage, 'beeg version',
default='1546225636701')
if len(video_id) >= 10:
query = {
'v': 2,
}
qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
t = qs.get('t', [''])[0].split('-')
if len(t) > 1:
query.update({
's': t[0],
'e': t[1],
})
else:
query = {'v': 1}
for api_path in ('', 'api.'):
video = self._download_json(
'https://%sbeeg.com/api/v6/%s/video/%s'
% (api_path, beeg_version, video_id), video_id,
fatal=api_path == 'api.', query=query)
if video:
break
formats = []
for format_id, video_url in video.items():
if not video_url:
continue
height = self._search_regex(
r'^(\d+)[pP]$', format_id, 'height', default=None)
if not height:
continue
formats.append({
'url': self._proto_relative_url(
video_url.replace('{DATA_MARKERS}', 'data=pc_XX__%s_0' % beeg_version), 'https:'),
'format_id': format_id,
'height': int(height),
})
self._sort_formats(formats)
title = video['title']
video_id = compat_str(video.get('id') or video_id)
display_id = video.get('code')
description = video.get('desc')
series = video.get('ps_name')
timestamp = unified_timestamp(video.get('date'))
duration = int_or_none(video.get('duration'))
tags = [tag.strip() for tag in video['tags'].split(',')] if video.get('tags') else None
return {
'id': video_id,
'display_id': display_id,
'title': title,
'description': description,
'series': series,
'timestamp': timestamp,
'duration': duration,
'tags': tags,
'formats': formats,
'age_limit': self._rta_search(webpage),
}

View File

@@ -0,0 +1,46 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import url_basename
class BehindKinkIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?behindkink\.com/(?P<year>[0-9]{4})/(?P<month>[0-9]{2})/(?P<day>[0-9]{2})/(?P<id>[^/#?_]+)'
_TEST = {
'url': 'http://www.behindkink.com/2014/12/05/what-are-you-passionate-about-marley-blaze/',
'md5': '507b57d8fdcd75a41a9a7bdb7989c762',
'info_dict': {
'id': '37127',
'ext': 'mp4',
'title': 'What are you passionate about Marley Blaze',
'description': 'md5:aee8e9611b4ff70186f752975d9b94b4',
'upload_date': '20141205',
'thumbnail': 'http://www.behindkink.com/wp-content/uploads/2014/12/blaze-1.jpg',
'age_limit': 18,
}
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
display_id = mobj.group('id')
webpage = self._download_webpage(url, display_id)
video_url = self._search_regex(
r'<source src="([^"]+)"', webpage, 'video URL')
video_id = url_basename(video_url).split('_')[0]
upload_date = mobj.group('year') + mobj.group('month') + mobj.group('day')
return {
'id': video_id,
'display_id': display_id,
'url': video_url,
'title': self._og_search_title(webpage),
'thumbnail': self._og_search_thumbnail(webpage),
'description': self._og_search_description(webpage),
'upload_date': upload_date,
'age_limit': 18,
}

View File

@@ -0,0 +1,88 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
class BellMediaIE(InfoExtractor):
_VALID_URL = r'''(?x)https?://(?:www\.)?
(?P<domain>
(?:
ctv|
tsn|
bnn(?:bloomberg)?|
thecomedynetwork|
discovery|
discoveryvelocity|
sciencechannel|
investigationdiscovery|
animalplanet|
bravo|
mtv|
space|
etalk|
marilyn
)\.ca|
(?:much|cp24)\.com
)/.*?(?:\b(?:vid(?:eoid)?|clipId)=|-vid|~|%7E|/(?:episode)?)(?P<id>[0-9]{6,})'''
_TESTS = [{
'url': 'https://www.bnnbloomberg.ca/video/david-cockfield-s-top-picks~1403070',
'md5': '36d3ef559cfe8af8efe15922cd3ce950',
'info_dict': {
'id': '1403070',
'ext': 'flv',
'title': 'David Cockfield\'s Top Picks',
'description': 'md5:810f7f8c6a83ad5b48677c3f8e5bb2c3',
'upload_date': '20180525',
'timestamp': 1527288600,
},
}, {
'url': 'http://www.thecomedynetwork.ca/video/player?vid=923582',
'only_matching': True,
}, {
'url': 'http://www.tsn.ca/video/expectations-high-for-milos-raonic-at-us-open~939549',
'only_matching': True,
}, {
'url': 'http://www.bnn.ca/video/berman-s-call-part-two-viewer-questions~939654',
'only_matching': True,
}, {
'url': 'http://www.ctv.ca/YourMorning/Video/S1E6-Monday-August-29-2016-vid938009',
'only_matching': True,
}, {
'url': 'http://www.much.com/shows/atmidnight/episode948007/tuesday-september-13-2016',
'only_matching': True,
}, {
'url': 'http://www.much.com/shows/the-almost-impossible-gameshow/928979/episode-6',
'only_matching': True,
}, {
'url': 'http://www.ctv.ca/DCs-Legends-of-Tomorrow/Video/S2E11-Turncoat-vid1051430',
'only_matching': True,
}, {
'url': 'http://www.etalk.ca/video?videoid=663455',
'only_matching': True,
}, {
'url': 'https://www.cp24.com/video?clipId=1982548',
'only_matching': True,
}]
_DOMAINS = {
'thecomedynetwork': 'comedy',
'discoveryvelocity': 'discvel',
'sciencechannel': 'discsci',
'investigationdiscovery': 'invdisc',
'animalplanet': 'aniplan',
'etalk': 'ctv',
'bnnbloomberg': 'bnn',
'marilyn': 'ctv_marilyn',
}
def _real_extract(self, url):
domain, video_id = re.match(self._VALID_URL, url).groups()
domain = domain.split('.')[0]
return {
'_type': 'url_transparent',
'id': video_id,
'url': '9c9media:%s_web:%s' % (self._DOMAINS.get(domain, domain), video_id),
'ie_key': 'NineCNineMedia',
}

View File

@@ -0,0 +1,80 @@
from __future__ import unicode_literals
from .mtv import MTVServicesInfoExtractor
from ..utils import unified_strdate
class BetIE(MTVServicesInfoExtractor):
_VALID_URL = r'https?://(?:www\.)?bet\.com/(?:[^/]+/)+(?P<id>.+?)\.html'
_TESTS = [
{
'url': 'http://www.bet.com/news/politics/2014/12/08/in-bet-exclusive-obama-talks-race-and-racism.html',
'info_dict': {
'id': '07e96bd3-8850-3051-b856-271b457f0ab8',
'display_id': 'in-bet-exclusive-obama-talks-race-and-racism',
'ext': 'flv',
'title': 'A Conversation With President Obama',
'description': 'President Obama urges persistence in confronting racism and bias.',
'duration': 1534,
'upload_date': '20141208',
'thumbnail': r're:(?i)^https?://.*\.jpg$',
'subtitles': {
'en': 'mincount:2',
}
},
'params': {
# rtmp download
'skip_download': True,
},
},
{
'url': 'http://www.bet.com/video/news/national/2014/justice-for-ferguson-a-community-reacts.html',
'info_dict': {
'id': '9f516bf1-7543-39c4-8076-dd441b459ba9',
'display_id': 'justice-for-ferguson-a-community-reacts',
'ext': 'flv',
'title': 'Justice for Ferguson: A Community Reacts',
'description': 'A BET News special.',
'duration': 1696,
'upload_date': '20141125',
'thumbnail': r're:(?i)^https?://.*\.jpg$',
'subtitles': {
'en': 'mincount:2',
}
},
'params': {
# rtmp download
'skip_download': True,
},
}
]
_FEED_URL = "http://feeds.mtvnservices.com/od/feed/bet-mrss-player"
def _get_feed_query(self, uri):
return {
'uuid': uri,
}
def _extract_mgid(self, webpage):
return self._search_regex(r'data-uri="([^"]+)', webpage, 'mgid')
def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
mgid = self._extract_mgid(webpage)
videos_info = self._get_videos_info(mgid)
info_dict = videos_info['entries'][0]
upload_date = unified_strdate(self._html_search_meta('date', webpage))
description = self._html_search_meta('description', webpage)
info_dict.update({
'display_id': display_id,
'description': description,
'upload_date': upload_date,
})
return info_dict

View File

@@ -0,0 +1,37 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import extract_attributes
class BFIPlayerIE(InfoExtractor):
IE_NAME = 'bfi:player'
_VALID_URL = r'https?://player\.bfi\.org\.uk/[^/]+/film/watch-(?P<id>[\w-]+)-online'
_TEST = {
'url': 'https://player.bfi.org.uk/free/film/watch-computer-doctor-1974-online',
'md5': 'e8783ebd8e061ec4bc6e9501ed547de8',
'info_dict': {
'id': 'htNnhlZjE60C9VySkQEIBtU-cNV1Xx63',
'ext': 'mp4',
'title': 'Computer Doctor',
'description': 'md5:fb6c240d40c4dbe40428bdd62f78203b',
},
'skip': 'BFI Player films cannot be played outside of the UK',
}
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
entries = []
for player_el in re.findall(r'(?s)<[^>]+class="player"[^>]*>', webpage):
player_attr = extract_attributes(player_el)
ooyala_id = player_attr.get('data-video-id')
if not ooyala_id:
continue
entries.append(self.url_result(
'ooyala:' + ooyala_id, 'Ooyala',
ooyala_id, player_attr.get('data-label')))
return self.playlist_result(entries)

View File

@@ -0,0 +1,78 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..compat import (
compat_b64decode,
compat_urllib_parse_unquote,
)
class BigflixIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?bigflix\.com/.+/(?P<id>[0-9]+)'
_TESTS = [{
# 2 formats
'url': 'http://www.bigflix.com/Tamil-movies/Drama-movies/Madarasapatinam/16070',
'info_dict': {
'id': '16070',
'ext': 'mp4',
'title': 'Madarasapatinam',
'description': 'md5:9f0470b26a4ba8e824c823b5d95c2f6b',
'formats': 'mincount:2',
},
'params': {
'skip_download': True,
}
}, {
# multiple formats
'url': 'http://www.bigflix.com/Malayalam-movies/Drama-movies/Indian-Rupee/15967',
'only_matching': True,
}]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
title = self._html_search_regex(
r'<div[^>]+class=["\']pagetitle["\'][^>]*>(.+?)</div>',
webpage, 'title')
def decode_url(quoted_b64_url):
return compat_b64decode(compat_urllib_parse_unquote(
quoted_b64_url)).decode('utf-8')
formats = []
for height, encoded_url in re.findall(
r'ContentURL_(\d{3,4})[pP][^=]+=([^&]+)', webpage):
video_url = decode_url(encoded_url)
f = {
'url': video_url,
'format_id': '%sp' % height,
'height': int(height),
}
if video_url.startswith('rtmp'):
f['ext'] = 'flv'
formats.append(f)
file_url = self._search_regex(
r'file=([^&]+)', webpage, 'video url', default=None)
if file_url:
video_url = decode_url(file_url)
if all(f['url'] != video_url for f in formats):
formats.append({
'url': decode_url(file_url),
})
self._sort_formats(formats)
description = self._html_search_meta('description', webpage)
return {
'id': video_id,
'title': title,
'description': description,
'formats': formats
}

View File

@@ -0,0 +1,40 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
int_or_none,
unescapeHTML,
)
class BildIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?bild\.de/(?:[^/]+/)+(?P<display_id>[^/]+)-(?P<id>\d+)(?:,auto=true)?\.bild\.html'
IE_DESC = 'Bild.de'
_TEST = {
'url': 'http://www.bild.de/video/clip/apple-ipad-air/das-koennen-die-neuen-ipads-38184146.bild.html',
'md5': 'dd495cbd99f2413502a1713a1156ac8a',
'info_dict': {
'id': '38184146',
'ext': 'mp4',
'title': 'Das können die neuen iPads',
'description': 'md5:a4058c4fa2a804ab59c00d7244bbf62f',
'thumbnail': r're:^https?://.*\.jpg$',
'duration': 196,
}
}
def _real_extract(self, url):
video_id = self._match_id(url)
video_data = self._download_json(
url.split('.bild.html')[0] + ',view=json.bild.html', video_id)
return {
'id': video_id,
'title': unescapeHTML(video_data['title']).strip(),
'description': unescapeHTML(video_data.get('description')),
'url': video_data['clipList'][0]['srces'][0]['src'],
'thumbnail': video_data.get('poster'),
'duration': int_or_none(video_data.get('durationSec')),
}

View File

@@ -0,0 +1,450 @@
# coding: utf-8
from __future__ import unicode_literals
import hashlib
import re
from .common import InfoExtractor
from ..compat import (
compat_parse_qs,
compat_urlparse,
)
from ..utils import (
ExtractorError,
int_or_none,
float_or_none,
parse_iso8601,
smuggle_url,
str_or_none,
strip_jsonp,
unified_timestamp,
unsmuggle_url,
urlencode_postdata,
)
class BiliBiliIE(InfoExtractor):
_VALID_URL = r'''(?x)
https?://
(?:(?:www|bangumi)\.)?
bilibili\.(?:tv|com)/
(?:
(?:
video/[aA][vV]|
anime/(?P<anime_id>\d+)/play\#
)(?P<id_bv>\d+)|
video/[bB][vV](?P<id>[^/?#&]+)
)
'''
_TESTS = [{
'url': 'http://www.bilibili.tv/video/av1074402/',
'md5': '5f7d29e1a2872f3df0cf76b1f87d3788',
'info_dict': {
'id': '1074402',
'ext': 'flv',
'title': '【金坷垃】金泡沫',
'description': 'md5:ce18c2a2d2193f0df2917d270f2e5923',
'duration': 308.067,
'timestamp': 1398012678,
'upload_date': '20140420',
'thumbnail': r're:^https?://.+\.jpg',
'uploader': '菊子桑',
'uploader_id': '156160',
},
}, {
# Tested in BiliBiliBangumiIE
'url': 'http://bangumi.bilibili.com/anime/1869/play#40062',
'only_matching': True,
}, {
'url': 'http://bangumi.bilibili.com/anime/5802/play#100643',
'md5': '3f721ad1e75030cc06faf73587cfec57',
'info_dict': {
'id': '100643',
'ext': 'mp4',
'title': 'CHAOS;CHILD',
'description': '如果你是神明并且能够让妄想成为现实。那你会进行怎么样的妄想是淫靡的世界独裁社会毁灭性的制裁还是……2015年涩谷。从6年前发生的大灾害“涩谷地震”之后复兴了的这个街区里新设立的私立高中...',
},
'skip': 'Geo-restricted to China',
}, {
# Title with double quotes
'url': 'http://www.bilibili.com/video/av8903802/',
'info_dict': {
'id': '8903802',
'title': '阿滴英文|英文歌分享#6 "Closer',
'description': '滴妹今天唱Closer給你聽! 有史以来,被推最多次也是最久的歌曲,其实歌词跟我原本想像差蛮多的,不过还是好听! 微博@阿滴英文',
},
'playlist': [{
'info_dict': {
'id': '8903802_part1',
'ext': 'flv',
'title': '阿滴英文|英文歌分享#6 "Closer',
'description': 'md5:3b1b9e25b78da4ef87e9b548b88ee76a',
'uploader': '阿滴英文',
'uploader_id': '65880958',
'timestamp': 1488382634,
'upload_date': '20170301',
},
'params': {
'skip_download': True, # Test metadata only
},
}, {
'info_dict': {
'id': '8903802_part2',
'ext': 'flv',
'title': '阿滴英文|英文歌分享#6 "Closer',
'description': 'md5:3b1b9e25b78da4ef87e9b548b88ee76a',
'uploader': '阿滴英文',
'uploader_id': '65880958',
'timestamp': 1488382634,
'upload_date': '20170301',
},
'params': {
'skip_download': True, # Test metadata only
},
}]
}, {
# new BV video id format
'url': 'https://www.bilibili.com/video/BV1JE411F741',
'only_matching': True,
}]
_APP_KEY = 'iVGUTjsxvpLeuDCf'
_BILIBILI_KEY = 'aHRmhWMLkdeMuILqORnYZocwMBpMEOdt'
def _report_error(self, result):
if 'message' in result:
raise ExtractorError('%s said: %s' % (self.IE_NAME, result['message']), expected=True)
elif 'code' in result:
raise ExtractorError('%s returns error %d' % (self.IE_NAME, result['code']), expected=True)
else:
raise ExtractorError('Can\'t extract Bangumi episode ID')
def _real_extract(self, url):
url, smuggled_data = unsmuggle_url(url, {})
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id') or mobj.group('id_bv')
anime_id = mobj.group('anime_id')
webpage = self._download_webpage(url, video_id)
if 'anime/' not in url:
cid = self._search_regex(
r'\bcid(?:["\']:|=)(\d+)', webpage, 'cid',
default=None
) or compat_parse_qs(self._search_regex(
[r'EmbedPlayer\([^)]+,\s*"([^"]+)"\)',
r'EmbedPlayer\([^)]+,\s*\\"([^"]+)\\"\)',
r'<iframe[^>]+src="https://secure\.bilibili\.com/secure,([^"]+)"'],
webpage, 'player parameters'))['cid'][0]
else:
if 'no_bangumi_tip' not in smuggled_data:
self.to_screen('Downloading episode %s. To download all videos in anime %s, re-run youtube-dlc with %s' % (
video_id, anime_id, compat_urlparse.urljoin(url, '//bangumi.bilibili.com/anime/%s' % anime_id)))
headers = {
'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
'Referer': url
}
headers.update(self.geo_verification_headers())
js = self._download_json(
'http://bangumi.bilibili.com/web_api/get_source', video_id,
data=urlencode_postdata({'episode_id': video_id}),
headers=headers)
if 'result' not in js:
self._report_error(js)
cid = js['result']['cid']
headers = {
'Referer': url
}
headers.update(self.geo_verification_headers())
entries = []
RENDITIONS = ('qn=80&quality=80&type=', 'quality=2&type=mp4')
for num, rendition in enumerate(RENDITIONS, start=1):
payload = 'appkey=%s&cid=%s&otype=json&%s' % (self._APP_KEY, cid, rendition)
sign = hashlib.md5((payload + self._BILIBILI_KEY).encode('utf-8')).hexdigest()
video_info = self._download_json(
'http://interface.bilibili.com/v2/playurl?%s&sign=%s' % (payload, sign),
video_id, note='Downloading video info page',
headers=headers, fatal=num == len(RENDITIONS))
if not video_info:
continue
if 'durl' not in video_info:
if num < len(RENDITIONS):
continue
self._report_error(video_info)
for idx, durl in enumerate(video_info['durl']):
formats = [{
'url': durl['url'],
'filesize': int_or_none(durl['size']),
}]
for backup_url in durl.get('backup_url', []):
formats.append({
'url': backup_url,
# backup URLs have lower priorities
'preference': -2 if 'hd.mp4' in backup_url else -3,
})
for a_format in formats:
a_format.setdefault('http_headers', {}).update({
'Referer': url,
})
self._sort_formats(formats)
entries.append({
'id': '%s_part%s' % (video_id, idx),
'duration': float_or_none(durl.get('length'), 1000),
'formats': formats,
})
break
title = self._html_search_regex(
('<h1[^>]+\btitle=(["\'])(?P<title>(?:(?!\1).)+)\1',
'(?s)<h1[^>]*>(?P<title>.+?)</h1>'), webpage, 'title',
group='title')
description = self._html_search_meta('description', webpage)
timestamp = unified_timestamp(self._html_search_regex(
r'<time[^>]+datetime="([^"]+)"', webpage, 'upload time',
default=None) or self._html_search_meta(
'uploadDate', webpage, 'timestamp', default=None))
thumbnail = self._html_search_meta(['og:image', 'thumbnailUrl'], webpage)
# TODO 'view_count' requires deobfuscating Javascript
info = {
'id': video_id,
'title': title,
'description': description,
'timestamp': timestamp,
'thumbnail': thumbnail,
'duration': float_or_none(video_info.get('timelength'), scale=1000),
}
uploader_mobj = re.search(
r'<a[^>]+href="(?:https?:)?//space\.bilibili\.com/(?P<id>\d+)"[^>]*>(?P<name>[^<]+)',
webpage)
if uploader_mobj:
info.update({
'uploader': uploader_mobj.group('name'),
'uploader_id': uploader_mobj.group('id'),
})
if not info.get('uploader'):
info['uploader'] = self._html_search_meta(
'author', webpage, 'uploader', default=None)
for entry in entries:
entry.update(info)
if len(entries) == 1:
return entries[0]
else:
for idx, entry in enumerate(entries):
entry['id'] = '%s_part%d' % (video_id, (idx + 1))
return {
'_type': 'multi_video',
'id': video_id,
'title': title,
'description': description,
'entries': entries,
}
class BiliBiliBangumiIE(InfoExtractor):
_VALID_URL = r'https?://bangumi\.bilibili\.com/anime/(?P<id>\d+)'
IE_NAME = 'bangumi.bilibili.com'
IE_DESC = 'BiliBili番剧'
_TESTS = [{
'url': 'http://bangumi.bilibili.com/anime/1869',
'info_dict': {
'id': '1869',
'title': '混沌武士',
'description': 'md5:6a9622b911565794c11f25f81d6a97d2',
},
'playlist_count': 26,
}, {
'url': 'http://bangumi.bilibili.com/anime/1869',
'info_dict': {
'id': '1869',
'title': '混沌武士',
'description': 'md5:6a9622b911565794c11f25f81d6a97d2',
},
'playlist': [{
'md5': '91da8621454dd58316851c27c68b0c13',
'info_dict': {
'id': '40062',
'ext': 'mp4',
'title': '混沌武士',
'description': '故事发生在日本的江户时代。风是一个小酒馆的打工女。一日,酒馆里来了一群恶霸,虽然他们的举动令风十分不满,但是毕竟风只是一届女流,无法对他们采取什么行动,只能在心里嘟哝。这时,酒家里又进来了个“不良份子...',
'timestamp': 1414538739,
'upload_date': '20141028',
'episode': '疾风怒涛 Tempestuous Temperaments',
'episode_number': 1,
},
}],
'params': {
'playlist_items': '1',
},
}]
@classmethod
def suitable(cls, url):
return False if BiliBiliIE.suitable(url) else super(BiliBiliBangumiIE, cls).suitable(url)
def _real_extract(self, url):
bangumi_id = self._match_id(url)
# Sometimes this API returns a JSONP response
season_info = self._download_json(
'http://bangumi.bilibili.com/jsonp/seasoninfo/%s.ver' % bangumi_id,
bangumi_id, transform_source=strip_jsonp)['result']
entries = [{
'_type': 'url_transparent',
'url': smuggle_url(episode['webplay_url'], {'no_bangumi_tip': 1}),
'ie_key': BiliBiliIE.ie_key(),
'timestamp': parse_iso8601(episode.get('update_time'), delimiter=' '),
'episode': episode.get('index_title'),
'episode_number': int_or_none(episode.get('index')),
} for episode in season_info['episodes']]
entries = sorted(entries, key=lambda entry: entry.get('episode_number'))
return self.playlist_result(
entries, bangumi_id,
season_info.get('bangumi_title'), season_info.get('evaluate'))
class BilibiliAudioBaseIE(InfoExtractor):
def _call_api(self, path, sid, query=None):
if not query:
query = {'sid': sid}
return self._download_json(
'https://www.bilibili.com/audio/music-service-c/web/' + path,
sid, query=query)['data']
class BilibiliAudioIE(BilibiliAudioBaseIE):
_VALID_URL = r'https?://(?:www\.)?bilibili\.com/audio/au(?P<id>\d+)'
_TEST = {
'url': 'https://www.bilibili.com/audio/au1003142',
'md5': 'fec4987014ec94ef9e666d4d158ad03b',
'info_dict': {
'id': '1003142',
'ext': 'm4a',
'title': '【tsukimi】YELLOW / 神山羊',
'artist': 'tsukimi',
'comment_count': int,
'description': 'YELLOW的mp3版',
'duration': 183,
'subtitles': {
'origin': [{
'ext': 'lrc',
}],
},
'thumbnail': r're:^https?://.+\.jpg',
'timestamp': 1564836614,
'upload_date': '20190803',
'uploader': 'tsukimi-つきみぐー',
'view_count': int,
},
}
def _real_extract(self, url):
au_id = self._match_id(url)
play_data = self._call_api('url', au_id)
formats = [{
'url': play_data['cdns'][0],
'filesize': int_or_none(play_data.get('size')),
}]
song = self._call_api('song/info', au_id)
title = song['title']
statistic = song.get('statistic') or {}
subtitles = None
lyric = song.get('lyric')
if lyric:
subtitles = {
'origin': [{
'url': lyric,
}]
}
return {
'id': au_id,
'title': title,
'formats': formats,
'artist': song.get('author'),
'comment_count': int_or_none(statistic.get('comment')),
'description': song.get('intro'),
'duration': int_or_none(song.get('duration')),
'subtitles': subtitles,
'thumbnail': song.get('cover'),
'timestamp': int_or_none(song.get('passtime')),
'uploader': song.get('uname'),
'view_count': int_or_none(statistic.get('play')),
}
class BilibiliAudioAlbumIE(BilibiliAudioBaseIE):
_VALID_URL = r'https?://(?:www\.)?bilibili\.com/audio/am(?P<id>\d+)'
_TEST = {
'url': 'https://www.bilibili.com/audio/am10624',
'info_dict': {
'id': '10624',
'title': '每日新曲推荐每日11:00更新',
'description': '每天11:00更新为你推送最新音乐',
},
'playlist_count': 19,
}
def _real_extract(self, url):
am_id = self._match_id(url)
songs = self._call_api(
'song/of-menu', am_id, {'sid': am_id, 'pn': 1, 'ps': 100})['data']
entries = []
for song in songs:
sid = str_or_none(song.get('id'))
if not sid:
continue
entries.append(self.url_result(
'https://www.bilibili.com/audio/au' + sid,
BilibiliAudioIE.ie_key(), sid))
if entries:
album_data = self._call_api('menu/info', am_id) or {}
album_title = album_data.get('title')
if album_title:
for entry in entries:
entry['album'] = album_title
return self.playlist_result(
entries, am_id, album_title, album_data.get('intro'))
return self.playlist_result(entries, am_id)
class BiliBiliPlayerIE(InfoExtractor):
_VALID_URL = r'https?://player\.bilibili\.com/player\.html\?.*?\baid=(?P<id>\d+)'
_TEST = {
'url': 'http://player.bilibili.com/player.html?aid=92494333&cid=157926707&page=1',
'only_matching': True,
}
def _real_extract(self, url):
video_id = self._match_id(url)
return self.url_result(
'http://www.bilibili.tv/video/av%s/' % video_id,
ie=BiliBiliIE.ie_key(), video_id=video_id)

View File

@@ -0,0 +1,86 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
ExtractorError,
remove_end,
)
class BioBioChileTVIE(InfoExtractor):
_VALID_URL = r'https?://(?:tv|www)\.biobiochile\.cl/(?:notas|noticias)/(?:[^/]+/)+(?P<id>[^/]+)\.shtml'
_TESTS = [{
'url': 'http://tv.biobiochile.cl/notas/2015/10/21/sobre-camaras-y-camarillas-parlamentarias.shtml',
'md5': '26f51f03cf580265defefb4518faec09',
'info_dict': {
'id': 'sobre-camaras-y-camarillas-parlamentarias',
'ext': 'mp4',
'title': 'Sobre Cámaras y camarillas parlamentarias',
'thumbnail': r're:^https?://.*\.jpg$',
'uploader': 'Fernando Atria',
},
'skip': 'URL expired and redirected to http://www.biobiochile.cl/portada/bbtv/index.html',
}, {
# different uploader layout
'url': 'http://tv.biobiochile.cl/notas/2016/03/18/natalia-valdebenito-repasa-a-diputado-hasbun-paso-a-la-categoria-de-hablar-brutalidades.shtml',
'md5': 'edc2e6b58974c46d5b047dea3c539ff3',
'info_dict': {
'id': 'natalia-valdebenito-repasa-a-diputado-hasbun-paso-a-la-categoria-de-hablar-brutalidades',
'ext': 'mp4',
'title': 'Natalia Valdebenito repasa a diputado Hasbún: Pasó a la categoría de hablar brutalidades',
'thumbnail': r're:^https?://.*\.jpg$',
'uploader': 'Piangella Obrador',
},
'params': {
'skip_download': True,
},
'skip': 'URL expired and redirected to http://www.biobiochile.cl/portada/bbtv/index.html',
}, {
'url': 'http://www.biobiochile.cl/noticias/bbtv/comentarios-bio-bio/2016/07/08/edecanes-del-congreso-figuras-decorativas-que-le-cuestan-muy-caro-a-los-chilenos.shtml',
'info_dict': {
'id': 'b4xd0LK3SK',
'ext': 'mp4',
# TODO: fix url_transparent information overriding
# 'uploader': 'Juan Pablo Echenique',
'title': 'Comentario Oscar Cáceres',
},
'params': {
# empty m3u8 manifest
'skip_download': True,
},
}, {
'url': 'http://tv.biobiochile.cl/notas/2015/10/22/ninos-transexuales-de-quien-es-la-decision.shtml',
'only_matching': True,
}, {
'url': 'http://tv.biobiochile.cl/notas/2015/10/21/exclusivo-hector-pinto-formador-de-chupete-revela-version-del-ex-delantero-albo.shtml',
'only_matching': True,
}]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
rudo_url = self._search_regex(
r'<iframe[^>]+src=(?P<q1>[\'"])(?P<url>(?:https?:)?//rudo\.video/vod/[0-9a-zA-Z]+)(?P=q1)',
webpage, 'embed URL', None, group='url')
if not rudo_url:
raise ExtractorError('No videos found')
title = remove_end(self._og_search_title(webpage), ' - BioBioChile TV')
thumbnail = self._og_search_thumbnail(webpage)
uploader = self._html_search_regex(
r'<a[^>]+href=["\'](?:https?://(?:busca|www)\.biobiochile\.cl)?/(?:lista/)?(?:author|autor)[^>]+>(.+?)</a>',
webpage, 'uploader', fatal=False)
return {
'_type': 'url_transparent',
'url': rudo_url,
'id': video_id,
'title': title,
'thumbnail': thumbnail,
'uploader': uploader,
}

View File

@@ -0,0 +1,105 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from .vk import VKIE
from ..compat import (
compat_b64decode,
compat_urllib_parse_unquote,
)
from ..utils import int_or_none
class BIQLEIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?biqle\.(?:com|org|ru)/watch/(?P<id>-?\d+_\d+)'
_TESTS = [{
# Youtube embed
'url': 'https://biqle.ru/watch/-115995369_456239081',
'md5': '97af5a06ee4c29bbf9c001bdb1cf5c06',
'info_dict': {
'id': '8v4f-avW-VI',
'ext': 'mp4',
'title': "PASSE-PARTOUT - L'ete c'est fait pour jouer",
'description': 'Passe-Partout',
'uploader_id': 'mrsimpsonstef3',
'uploader': 'Phanolito',
'upload_date': '20120822',
},
}, {
'url': 'http://biqle.org/watch/-44781847_168547604',
'md5': '7f24e72af1db0edf7c1aaba513174f97',
'info_dict': {
'id': '-44781847_168547604',
'ext': 'mp4',
'title': 'Ребенок в шоке от автоматической мойки',
'timestamp': 1396633454,
'uploader': 'Dmitry Kotov',
'upload_date': '20140404',
'uploader_id': '47850140',
},
}]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
embed_url = self._proto_relative_url(self._search_regex(
r'<iframe.+?src="((?:https?:)?//(?:daxab\.com|dxb\.to|[^/]+/player)/[^"]+)".*?></iframe>',
webpage, 'embed url'))
if VKIE.suitable(embed_url):
return self.url_result(embed_url, VKIE.ie_key(), video_id)
embed_page = self._download_webpage(
embed_url, video_id, headers={'Referer': url})
video_ext = self._get_cookies(embed_url).get('video_ext')
if video_ext:
video_ext = compat_urllib_parse_unquote(video_ext.value)
if not video_ext:
video_ext = compat_b64decode(self._search_regex(
r'video_ext\s*:\s*[\'"]([A-Za-z0-9+/=]+)',
embed_page, 'video_ext')).decode()
video_id, sig, _, access_token = video_ext.split(':')
item = self._download_json(
'https://api.vk.com/method/video.get', video_id,
headers={'User-Agent': 'okhttp/3.4.1'}, query={
'access_token': access_token,
'sig': sig,
'v': 5.44,
'videos': video_id,
})['response']['items'][0]
title = item['title']
formats = []
for f_id, f_url in item.get('files', {}).items():
if f_id == 'external':
return self.url_result(f_url)
ext, height = f_id.split('_')
formats.append({
'format_id': height + 'p',
'url': f_url,
'height': int_or_none(height),
'ext': ext,
})
self._sort_formats(formats)
thumbnails = []
for k, v in item.items():
if k.startswith('photo_') and v:
width = k.replace('photo_', '')
thumbnails.append({
'id': width,
'url': v,
'width': int_or_none(width),
})
return {
'id': video_id,
'title': title,
'formats': formats,
'comment_count': int_or_none(item.get('comments')),
'description': item.get('description'),
'duration': int_or_none(item.get('duration')),
'thumbnails': thumbnails,
'timestamp': int_or_none(item.get('date')),
'uploader': item.get('owner_id'),
'view_count': int_or_none(item.get('views')),
}

View File

@@ -0,0 +1,142 @@
# coding: utf-8
from __future__ import unicode_literals
import itertools
import re
from .common import InfoExtractor
from ..utils import (
orderedSet,
unified_strdate,
urlencode_postdata,
)
class BitChuteIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?bitchute\.com/(?:video|embed|torrent/[^/]+)/(?P<id>[^/?#&]+)'
_TESTS = [{
'url': 'https://www.bitchute.com/video/szoMrox2JEI/',
'md5': '66c4a70e6bfc40dcb6be3eb1d74939eb',
'info_dict': {
'id': 'szoMrox2JEI',
'ext': 'mp4',
'title': 'Fuck bitches get money',
'description': 'md5:3f21f6fb5b1d17c3dee9cf6b5fe60b3a',
'thumbnail': r're:^https?://.*\.jpg$',
'uploader': 'Victoria X Rave',
'upload_date': '20170813',
},
}, {
'url': 'https://www.bitchute.com/embed/lbb5G1hjPhw/',
'only_matching': True,
}, {
'url': 'https://www.bitchute.com/torrent/Zee5BE49045h/szoMrox2JEI.webtorrent',
'only_matching': True,
}]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(
'https://www.bitchute.com/video/%s' % video_id, video_id, headers={
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.57 Safari/537.36',
})
title = self._html_search_regex(
(r'<[^>]+\bid=["\']video-title[^>]+>([^<]+)', r'<title>([^<]+)'),
webpage, 'title', default=None) or self._html_search_meta(
'description', webpage, 'title',
default=None) or self._og_search_description(webpage)
format_urls = []
for mobj in re.finditer(
r'addWebSeed\s*\(\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage):
format_urls.append(mobj.group('url'))
format_urls.extend(re.findall(r'as=(https?://[^&"\']+)', webpage))
formats = [
{'url': format_url}
for format_url in orderedSet(format_urls)]
if not formats:
formats = self._parse_html5_media_entries(
url, webpage, video_id)[0]['formats']
self._check_formats(formats, video_id)
self._sort_formats(formats)
description = self._html_search_regex(
r'(?s)<div\b[^>]+\bclass=["\']full hidden[^>]+>(.+?)</div>',
webpage, 'description', fatal=False)
thumbnail = self._og_search_thumbnail(
webpage, default=None) or self._html_search_meta(
'twitter:image:src', webpage, 'thumbnail')
uploader = self._html_search_regex(
(r'(?s)<div class=["\']channel-banner.*?<p\b[^>]+\bclass=["\']name[^>]+>(.+?)</p>',
r'(?s)<p\b[^>]+\bclass=["\']video-author[^>]+>(.+?)</p>'),
webpage, 'uploader', fatal=False)
upload_date = unified_strdate(self._search_regex(
r'class=["\']video-publish-date[^>]+>[^<]+ at \d+:\d+ UTC on (.+?)\.',
webpage, 'upload date', fatal=False))
return {
'id': video_id,
'title': title,
'description': description,
'thumbnail': thumbnail,
'uploader': uploader,
'upload_date': upload_date,
'formats': formats,
}
class BitChuteChannelIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?bitchute\.com/channel/(?P<id>[^/?#&]+)'
_TEST = {
'url': 'https://www.bitchute.com/channel/victoriaxrave/',
'playlist_mincount': 185,
'info_dict': {
'id': 'victoriaxrave',
},
}
_TOKEN = 'zyG6tQcGPE5swyAEFLqKUwMuMMuF6IO2DZ6ZDQjGfsL0e4dcTLwqkTTul05Jdve7'
def _entries(self, channel_id):
channel_url = 'https://www.bitchute.com/channel/%s/' % channel_id
offset = 0
for page_num in itertools.count(1):
data = self._download_json(
'%sextend/' % channel_url, channel_id,
'Downloading channel page %d' % page_num,
data=urlencode_postdata({
'csrfmiddlewaretoken': self._TOKEN,
'name': '',
'offset': offset,
}), headers={
'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
'Referer': channel_url,
'X-Requested-With': 'XMLHttpRequest',
'Cookie': 'csrftoken=%s' % self._TOKEN,
})
if data.get('success') is False:
break
html = data.get('html')
if not html:
break
video_ids = re.findall(
r'class=["\']channel-videos-image-container[^>]+>\s*<a\b[^>]+\bhref=["\']/video/([^"\'/]+)',
html)
if not video_ids:
break
offset += len(video_ids)
for video_id in video_ids:
yield self.url_result(
'https://www.bitchute.com/video/%s' % video_id,
ie=BitChuteIE.ie_key(), video_id=video_id)
def _real_extract(self, url):
channel_id = self._match_id(url)
return self.playlist_result(
self._entries(channel_id), playlist_id=channel_id)

View File

@@ -0,0 +1,106 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from .amp import AMPIE
from ..utils import (
ExtractorError,
int_or_none,
parse_iso8601,
)
class BleacherReportIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?bleacherreport\.com/articles/(?P<id>\d+)'
_TESTS = [{
'url': 'http://bleacherreport.com/articles/2496438-fsu-stat-projections-is-jalen-ramsey-best-defensive-player-in-college-football',
'md5': 'a3ffc3dc73afdbc2010f02d98f990f20',
'info_dict': {
'id': '2496438',
'ext': 'mp4',
'title': 'FSU Stat Projections: Is Jalen Ramsey Best Defensive Player in College Football?',
'uploader_id': 3992341,
'description': 'CFB, ACC, Florida State',
'timestamp': 1434380212,
'upload_date': '20150615',
'uploader': 'Team Stream Now ',
},
'add_ie': ['Ooyala'],
}, {
'url': 'http://bleacherreport.com/articles/2586817-aussie-golfers-get-fright-of-their-lives-after-being-chased-by-angry-kangaroo',
'md5': '6a5cd403418c7b01719248ca97fb0692',
'info_dict': {
'id': '2586817',
'ext': 'webm',
'title': 'Aussie Golfers Get Fright of Their Lives After Being Chased by Angry Kangaroo',
'timestamp': 1446839961,
'uploader': 'Sean Fay',
'description': 'md5:b1601e2314c4d8eec23b6eafe086a757',
'uploader_id': 6466954,
'upload_date': '20151011',
},
'add_ie': ['Youtube'],
}]
def _real_extract(self, url):
article_id = self._match_id(url)
article_data = self._download_json('http://api.bleacherreport.com/api/v1/articles/%s' % article_id, article_id)['article']
thumbnails = []
primary_photo = article_data.get('primaryPhoto')
if primary_photo:
thumbnails = [{
'url': primary_photo['url'],
'width': primary_photo.get('width'),
'height': primary_photo.get('height'),
}]
info = {
'_type': 'url_transparent',
'id': article_id,
'title': article_data['title'],
'uploader': article_data.get('author', {}).get('name'),
'uploader_id': article_data.get('authorId'),
'timestamp': parse_iso8601(article_data.get('createdAt')),
'thumbnails': thumbnails,
'comment_count': int_or_none(article_data.get('commentsCount')),
'view_count': int_or_none(article_data.get('hitCount')),
}
video = article_data.get('video')
if video:
video_type = video['type']
if video_type in ('cms.bleacherreport.com', 'vid.bleacherreport.com'):
info['url'] = 'http://bleacherreport.com/video_embed?id=%s' % video['id']
elif video_type == 'ooyala.com':
info['url'] = 'ooyala:%s' % video['id']
elif video_type == 'youtube.com':
info['url'] = video['id']
elif video_type == 'vine.co':
info['url'] = 'https://vine.co/v/%s' % video['id']
else:
info['url'] = video_type + video['id']
return info
else:
raise ExtractorError('no video in the article', expected=True)
class BleacherReportCMSIE(AMPIE):
_VALID_URL = r'https?://(?:www\.)?bleacherreport\.com/video_embed\?id=(?P<id>[0-9a-f-]{36}|\d{5})'
_TESTS = [{
'url': 'http://bleacherreport.com/video_embed?id=8fd44c2f-3dc5-4821-9118-2c825a98c0e1&library=video-cms',
'md5': '2e4b0a997f9228ffa31fada5c53d1ed1',
'info_dict': {
'id': '8fd44c2f-3dc5-4821-9118-2c825a98c0e1',
'ext': 'flv',
'title': 'Cena vs. Rollins Would Expose the Heavyweight Division',
'description': 'md5:984afb4ade2f9c0db35f3267ed88b36e',
},
}]
def _real_extract(self, url):
video_id = self._match_id(url)
info = self._extract_feed_info('http://vid.bleacherreport.com/videos/%s.akamai' % video_id)
info['id'] = video_id
return info

View File

@@ -0,0 +1,86 @@
from __future__ import unicode_literals
import json
from .common import InfoExtractor
from ..utils import (
remove_start,
int_or_none,
)
class BlinkxIE(InfoExtractor):
_VALID_URL = r'(?:https?://(?:www\.)blinkx\.com/#?ce/|blinkx:)(?P<id>[^?]+)'
IE_NAME = 'blinkx'
_TEST = {
'url': 'http://www.blinkx.com/ce/Da0Gw3xc5ucpNduzLuDDlv4WC9PuI4fDi1-t6Y3LyfdY2SZS5Urbvn-UPJvrvbo8LTKTc67Wu2rPKSQDJyZeeORCR8bYkhs8lI7eqddznH2ofh5WEEdjYXnoRtj7ByQwt7atMErmXIeYKPsSDuMAAqJDlQZ-3Ff4HJVeH_s3Gh8oQ',
'md5': '337cf7a344663ec79bf93a526a2e06c7',
'info_dict': {
'id': 'Da0Gw3xc',
'ext': 'mp4',
'title': 'No Daily Show for John Oliver; HBO Show Renewed - IGN News',
'uploader': 'IGN News',
'upload_date': '20150217',
'timestamp': 1424215740,
'description': 'HBO has renewed Last Week Tonight With John Oliver for two more seasons.',
'duration': 47.743333,
},
}
def _real_extract(self, url):
video_id = self._match_id(url)
display_id = video_id[:8]
api_url = ('https://apib4.blinkx.com/api.php?action=play_video&'
+ 'video=%s' % video_id)
data_json = self._download_webpage(api_url, display_id)
data = json.loads(data_json)['api']['results'][0]
duration = None
thumbnails = []
formats = []
for m in data['media']:
if m['type'] == 'jpg':
thumbnails.append({
'url': m['link'],
'width': int(m['w']),
'height': int(m['h']),
})
elif m['type'] == 'original':
duration = float(m['d'])
elif m['type'] == 'youtube':
yt_id = m['link']
self.to_screen('Youtube video detected: %s' % yt_id)
return self.url_result(yt_id, 'Youtube', video_id=yt_id)
elif m['type'] in ('flv', 'mp4'):
vcodec = remove_start(m['vcodec'], 'ff')
acodec = remove_start(m['acodec'], 'ff')
vbr = int_or_none(m.get('vbr') or m.get('vbitrate'), 1000)
abr = int_or_none(m.get('abr') or m.get('abitrate'), 1000)
tbr = vbr + abr if vbr and abr else None
format_id = '%s-%sk-%s' % (vcodec, tbr, m['w'])
formats.append({
'format_id': format_id,
'url': m['link'],
'vcodec': vcodec,
'acodec': acodec,
'abr': abr,
'vbr': vbr,
'tbr': tbr,
'width': int_or_none(m.get('w')),
'height': int_or_none(m.get('h')),
})
self._sort_formats(formats)
return {
'id': display_id,
'fullid': video_id,
'title': data['title'],
'formats': formats,
'uploader': data['channel_name'],
'timestamp': data['pubdate_epoch'],
'description': data.get('description'),
'thumbnails': thumbnails,
'duration': duration,
}

View File

@@ -0,0 +1,83 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
class BloombergIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?bloomberg\.com/(?:[^/]+/)*(?P<id>[^/?#]+)'
_TESTS = [{
'url': 'http://www.bloomberg.com/news/videos/b/aaeae121-5949-481e-a1ce-4562db6f5df2',
# The md5 checksum changes
'info_dict': {
'id': 'qurhIVlJSB6hzkVi229d8g',
'ext': 'flv',
'title': 'Shah\'s Presentation on Foreign-Exchange Strategies',
'description': 'md5:a8ba0302912d03d246979735c17d2761',
},
'params': {
'format': 'best[format_id^=hds]',
},
}, {
# video ID in BPlayer(...)
'url': 'http://www.bloomberg.com/features/2016-hello-world-new-zealand/',
'info_dict': {
'id': '938c7e72-3f25-4ddb-8b85-a9be731baa74',
'ext': 'flv',
'title': 'Meet the Real-Life Tech Wizards of Middle Earth',
'description': 'Hello World, Episode 1: New Zealands freaky AI babies, robot exoskeletons, and a virtual you.',
},
'params': {
'format': 'best[format_id^=hds]',
},
}, {
# data-bmmrid=
'url': 'https://www.bloomberg.com/politics/articles/2017-02-08/le-pen-aide-briefed-french-central-banker-on-plan-to-print-money',
'only_matching': True,
}, {
'url': 'http://www.bloomberg.com/news/articles/2015-11-12/five-strange-things-that-have-been-happening-in-financial-markets',
'only_matching': True,
}, {
'url': 'http://www.bloomberg.com/politics/videos/2015-11-25/karl-rove-on-jeb-bush-s-struggles-stopping-trump',
'only_matching': True,
}]
def _real_extract(self, url):
name = self._match_id(url)
webpage = self._download_webpage(url, name)
video_id = self._search_regex(
(r'["\']bmmrId["\']\s*:\s*(["\'])(?P<id>(?:(?!\1).)+)\1',
r'videoId\s*:\s*(["\'])(?P<id>(?:(?!\1).)+)\1',
r'data-bmmrid=(["\'])(?P<id>(?:(?!\1).)+)\1'),
webpage, 'id', group='id', default=None)
if not video_id:
bplayer_data = self._parse_json(self._search_regex(
r'BPlayer\(null,\s*({[^;]+})\);', webpage, 'id'), name)
video_id = bplayer_data['id']
title = re.sub(': Video$', '', self._og_search_title(webpage))
embed_info = self._download_json(
'http://www.bloomberg.com/api/embed?id=%s' % video_id, video_id)
formats = []
for stream in embed_info['streams']:
stream_url = stream.get('url')
if not stream_url:
continue
if stream['muxing_format'] == 'TS':
formats.extend(self._extract_m3u8_formats(
stream_url, video_id, 'mp4', m3u8_id='hls', fatal=False))
else:
formats.extend(self._extract_f4m_formats(
stream_url, video_id, f4m_id='hds', fatal=False))
self._sort_formats(formats)
return {
'id': video_id,
'title': title,
'formats': formats,
'description': self._og_search_description(webpage),
'thumbnail': self._og_search_thumbnail(webpage),
}

View File

@@ -0,0 +1,60 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..compat import compat_parse_qs
from ..utils import ExtractorError
class BokeCCBaseIE(InfoExtractor):
def _extract_bokecc_formats(self, webpage, video_id, format_id=None):
player_params_str = self._html_search_regex(
r'<(?:script|embed)[^>]+src=(?P<q>["\'])(?:https?:)?//p\.bokecc\.com/(?:player|flash/player\.swf)\?(?P<query>.+?)(?P=q)',
webpage, 'player params', group='query')
player_params = compat_parse_qs(player_params_str)
info_xml = self._download_xml(
'http://p.bokecc.com/servlet/playinfo?uid=%s&vid=%s&m=1' % (
player_params['siteid'][0], player_params['vid'][0]), video_id)
formats = [{
'format_id': format_id,
'url': quality.find('./copy').attrib['playurl'],
'preference': int(quality.attrib['value']),
} for quality in info_xml.findall('./video/quality')]
self._sort_formats(formats)
return formats
class BokeCCIE(BokeCCBaseIE):
_IE_DESC = 'CC视频'
_VALID_URL = r'https?://union\.bokecc\.com/playvideo\.bo\?(?P<query>.*)'
_TESTS = [{
'url': 'http://union.bokecc.com/playvideo.bo?vid=E0ABAE9D4F509B189C33DC5901307461&uid=FE644790DE9D154A',
'info_dict': {
'id': 'FE644790DE9D154A_E0ABAE9D4F509B189C33DC5901307461',
'ext': 'flv',
'title': 'BokeCC Video',
},
}]
def _real_extract(self, url):
qs = compat_parse_qs(re.match(self._VALID_URL, url).group('query'))
if not qs.get('vid') or not qs.get('uid'):
raise ExtractorError('Invalid URL', expected=True)
video_id = '%s_%s' % (qs['uid'][0], qs['vid'][0])
webpage = self._download_webpage(url, video_id)
return {
'id': video_id,
'title': 'BokeCC Video', # no title provided in the webpage
'formats': self._extract_bokecc_formats(webpage, video_id),
}

View File

@@ -0,0 +1,72 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
extract_attributes,
)
class BostonGlobeIE(InfoExtractor):
_VALID_URL = r'(?i)https?://(?:www\.)?bostonglobe\.com/.*/(?P<id>[^/]+)/\w+(?:\.html)?'
_TESTS = [
{
'url': 'http://www.bostonglobe.com/metro/2017/02/11/tree-finally-succumbs-disease-leaving-hole-neighborhood/h1b4lviqzMTIn9sVy8F3gP/story.html',
'md5': '0a62181079c85c2d2b618c9a738aedaf',
'info_dict': {
'title': 'A tree finally succumbs to disease, leaving a hole in a neighborhood',
'id': '5320421710001',
'ext': 'mp4',
'description': 'It arrived as a sapling when the Back Bay was in its infancy, a spindly American elm tamped down into a square of dirt cut into the brick sidewalk of 1880s Marlborough Street, no higher than the first bay window of the new brownstone behind it.',
'timestamp': 1486877593,
'upload_date': '20170212',
'uploader_id': '245991542',
},
},
{
# Embedded youtube video; we hand it off to the Generic extractor.
'url': 'https://www.bostonglobe.com/lifestyle/names/2017/02/17/does-ben-affleck-play-matt-damon-favorite-version-batman/ruqkc9VxKBYmh5txn1XhSI/story.html',
'md5': '582b40327089d5c0c949b3c54b13c24b',
'info_dict': {
'title': "Who Is Matt Damon's Favorite Batman?",
'id': 'ZW1QCnlA6Qc',
'ext': 'mp4',
'upload_date': '20170217',
'description': 'md5:3b3dccb9375867e0b4d527ed87d307cb',
'uploader': 'The Late Late Show with James Corden',
'uploader_id': 'TheLateLateShow',
},
'expected_warnings': ['404'],
},
]
def _real_extract(self, url):
page_id = self._match_id(url)
webpage = self._download_webpage(url, page_id)
page_title = self._og_search_title(webpage, default=None)
# <video data-brightcove-video-id="5320421710001" data-account="245991542" data-player="SJWAiyYWg" data-embed="default" class="video-js" controls itemscope itemtype="http://schema.org/VideoObject">
entries = []
for video in re.findall(r'(?i)(<video[^>]+>)', webpage):
attrs = extract_attributes(video)
video_id = attrs.get('data-brightcove-video-id')
account_id = attrs.get('data-account')
player_id = attrs.get('data-player')
embed = attrs.get('data-embed')
if video_id and account_id and player_id and embed:
entries.append(
'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s'
% (account_id, player_id, embed, video_id))
if len(entries) == 0:
return self.url_result(url, 'Generic')
elif len(entries) == 1:
return self.url_result(entries[0], 'BrightcoveNew')
else:
return self.playlist_from_matches(entries, page_id, page_title, ie='BrightcoveNew')

View File

@@ -0,0 +1,62 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
js_to_json,
determine_ext,
)
class BpbIE(InfoExtractor):
IE_DESC = 'Bundeszentrale für politische Bildung'
_VALID_URL = r'https?://(?:www\.)?bpb\.de/mediathek/(?P<id>[0-9]+)/'
_TEST = {
'url': 'http://www.bpb.de/mediathek/297/joachim-gauck-zu-1989-und-die-erinnerung-an-die-ddr',
# md5 fails in Python 2.6 due to buggy server response and wrong handling of urllib2
'md5': 'c4f84c8a8044ca9ff68bb8441d300b3f',
'info_dict': {
'id': '297',
'ext': 'mp4',
'title': 'Joachim Gauck zu 1989 und die Erinnerung an die DDR',
'description': 'Joachim Gauck, erster Beauftragter für die Stasi-Unterlagen, spricht auf dem Geschichtsforum über die friedliche Revolution 1989 und eine "gewisse Traurigkeit" im Umgang mit der DDR-Vergangenheit.'
}
}
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
title = self._html_search_regex(
r'<h2 class="white">(.*?)</h2>', webpage, 'title')
video_info_dicts = re.findall(
r"({\s*src\s*:\s*'https?://film\.bpb\.de/[^}]+})", webpage)
formats = []
for video_info in video_info_dicts:
video_info = self._parse_json(
video_info, video_id, transform_source=js_to_json, fatal=False)
if not video_info:
continue
video_url = video_info.get('src')
if not video_url:
continue
quality = 'high' if '_high' in video_url else 'low'
formats.append({
'url': video_url,
'preference': 10 if quality == 'high' else 0,
'format_note': quality,
'format_id': '%s-%s' % (quality, determine_ext(video_url)),
})
self._sort_formats(formats)
return {
'id': video_id,
'formats': formats,
'title': title,
'description': self._og_search_description(webpage),
}

311
youtube_dlc/extractor/br.py Normal file
View File

@@ -0,0 +1,311 @@
# coding: utf-8
from __future__ import unicode_literals
import json
import re
from .common import InfoExtractor
from ..utils import (
determine_ext,
ExtractorError,
int_or_none,
parse_duration,
parse_iso8601,
xpath_element,
xpath_text,
)
class BRIE(InfoExtractor):
IE_DESC = 'Bayerischer Rundfunk'
_VALID_URL = r'(?P<base_url>https?://(?:www\.)?br(?:-klassik)?\.de)/(?:[a-z0-9\-_]+/)+(?P<id>[a-z0-9\-_]+)\.html'
_TESTS = [
{
'url': 'http://www.br.de/mediathek/video/sendungen/abendschau/betriebliche-altersvorsorge-104.html',
'md5': '83a0477cf0b8451027eb566d88b51106',
'info_dict': {
'id': '48f656ef-287e-486f-be86-459122db22cc',
'ext': 'mp4',
'title': 'Die böse Überraschung',
'description': 'md5:ce9ac81b466ce775b8018f6801b48ac9',
'duration': 180,
'uploader': 'Reinhard Weber',
'upload_date': '20150422',
},
'skip': '404 not found',
},
{
'url': 'http://www.br.de/nachrichten/oberbayern/inhalt/muenchner-polizeipraesident-schreiber-gestorben-100.html',
'md5': 'af3a3a4aa43ff0ce6a89504c67f427ef',
'info_dict': {
'id': 'a4b83e34-123d-4b81-9f4e-c0d3121a4e05',
'ext': 'flv',
'title': 'Manfred Schreiber ist tot',
'description': 'md5:b454d867f2a9fc524ebe88c3f5092d97',
'duration': 26,
},
'skip': '404 not found',
},
{
'url': 'https://www.br-klassik.de/audio/peeping-tom-premierenkritik-dance-festival-muenchen-100.html',
'md5': '8b5b27c0b090f3b35eac4ab3f7a73d3d',
'info_dict': {
'id': '74c603c9-26d3-48bb-b85b-079aeed66e0b',
'ext': 'aac',
'title': 'Kurzweilig und sehr bewegend',
'description': 'md5:0351996e3283d64adeb38ede91fac54e',
'duration': 296,
},
'skip': '404 not found',
},
{
'url': 'http://www.br.de/radio/bayern1/service/team/videos/team-video-erdelt100.html',
'md5': 'dbab0aef2e047060ea7a21fc1ce1078a',
'info_dict': {
'id': '6ba73750-d405-45d3-861d-1ce8c524e059',
'ext': 'mp4',
'title': 'Umweltbewusster Häuslebauer',
'description': 'md5:d52dae9792d00226348c1dbb13c9bae2',
'duration': 116,
}
},
{
'url': 'http://www.br.de/fernsehen/br-alpha/sendungen/kant-fuer-anfaenger/kritik-der-reinen-vernunft/kant-kritik-01-metaphysik100.html',
'md5': '23bca295f1650d698f94fc570977dae3',
'info_dict': {
'id': 'd982c9ce-8648-4753-b358-98abb8aec43d',
'ext': 'mp4',
'title': 'Folge 1 - Metaphysik',
'description': 'md5:bb659990e9e59905c3d41e369db1fbe3',
'duration': 893,
'uploader': 'Eva Maria Steimle',
'upload_date': '20170208',
}
},
]
def _real_extract(self, url):
base_url, display_id = re.search(self._VALID_URL, url).groups()
page = self._download_webpage(url, display_id)
xml_url = self._search_regex(
r"return BRavFramework\.register\(BRavFramework\('avPlayer_(?:[a-f0-9-]{36})'\)\.setup\({dataURL:'(/(?:[a-z0-9\-]+/)+[a-z0-9/~_.-]+)'}\)\);", page, 'XMLURL')
xml = self._download_xml(base_url + xml_url, display_id)
medias = []
for xml_media in xml.findall('video') + xml.findall('audio'):
media_id = xml_media.get('externalId')
media = {
'id': media_id,
'title': xpath_text(xml_media, 'title', 'title', True),
'duration': parse_duration(xpath_text(xml_media, 'duration')),
'formats': self._extract_formats(xpath_element(
xml_media, 'assets'), media_id),
'thumbnails': self._extract_thumbnails(xpath_element(
xml_media, 'teaserImage/variants'), base_url),
'description': xpath_text(xml_media, 'desc'),
'webpage_url': xpath_text(xml_media, 'permalink'),
'uploader': xpath_text(xml_media, 'author'),
}
broadcast_date = xpath_text(xml_media, 'broadcastDate')
if broadcast_date:
media['upload_date'] = ''.join(reversed(broadcast_date.split('.')))
medias.append(media)
if len(medias) > 1:
self._downloader.report_warning(
'found multiple medias; please '
'report this with the video URL to http://yt-dl.org/bug')
if not medias:
raise ExtractorError('No media entries found')
return medias[0]
def _extract_formats(self, assets, media_id):
formats = []
for asset in assets.findall('asset'):
format_url = xpath_text(asset, ['downloadUrl', 'url'])
asset_type = asset.get('type')
if asset_type.startswith('HDS'):
formats.extend(self._extract_f4m_formats(
format_url + '?hdcore=3.2.0', media_id, f4m_id='hds', fatal=False))
elif asset_type.startswith('HLS'):
formats.extend(self._extract_m3u8_formats(
format_url, media_id, 'mp4', 'm3u8_native', m3u8_id='hds', fatal=False))
else:
format_info = {
'ext': xpath_text(asset, 'mediaType'),
'width': int_or_none(xpath_text(asset, 'frameWidth')),
'height': int_or_none(xpath_text(asset, 'frameHeight')),
'tbr': int_or_none(xpath_text(asset, 'bitrateVideo')),
'abr': int_or_none(xpath_text(asset, 'bitrateAudio')),
'vcodec': xpath_text(asset, 'codecVideo'),
'acodec': xpath_text(asset, 'codecAudio'),
'container': xpath_text(asset, 'mediaType'),
'filesize': int_or_none(xpath_text(asset, 'size')),
}
format_url = self._proto_relative_url(format_url)
if format_url:
http_format_info = format_info.copy()
http_format_info.update({
'url': format_url,
'format_id': 'http-%s' % asset_type,
})
formats.append(http_format_info)
server_prefix = xpath_text(asset, 'serverPrefix')
if server_prefix:
rtmp_format_info = format_info.copy()
rtmp_format_info.update({
'url': server_prefix,
'play_path': xpath_text(asset, 'fileName'),
'format_id': 'rtmp-%s' % asset_type,
})
formats.append(rtmp_format_info)
self._sort_formats(formats)
return formats
def _extract_thumbnails(self, variants, base_url):
thumbnails = [{
'url': base_url + xpath_text(variant, 'url'),
'width': int_or_none(xpath_text(variant, 'width')),
'height': int_or_none(xpath_text(variant, 'height')),
} for variant in variants.findall('variant') if xpath_text(variant, 'url')]
thumbnails.sort(key=lambda x: x['width'] * x['height'], reverse=True)
return thumbnails
class BRMediathekIE(InfoExtractor):
IE_DESC = 'Bayerischer Rundfunk Mediathek'
_VALID_URL = r'https?://(?:www\.)?br\.de/mediathek/video/[^/?&#]*?-(?P<id>av:[0-9a-f]{24})'
_TESTS = [{
'url': 'https://www.br.de/mediathek/video/gesundheit-die-sendung-vom-28112017-av:5a1e6a6e8fce6d001871cc8e',
'md5': 'fdc3d485835966d1622587d08ba632ec',
'info_dict': {
'id': 'av:5a1e6a6e8fce6d001871cc8e',
'ext': 'mp4',
'title': 'Die Sendung vom 28.11.2017',
'description': 'md5:6000cdca5912ab2277e5b7339f201ccc',
'timestamp': 1511942766,
'upload_date': '20171129',
}
}]
def _real_extract(self, url):
clip_id = self._match_id(url)
clip = self._download_json(
'https://proxy-base.master.mango.express/graphql',
clip_id, data=json.dumps({
"query": """{
viewer {
clip(id: "%s") {
title
description
duration
createdAt
ageRestriction
videoFiles {
edges {
node {
publicLocation
fileSize
videoProfile {
width
height
bitrate
encoding
}
}
}
}
captionFiles {
edges {
node {
publicLocation
}
}
}
teaserImages {
edges {
node {
imageFiles {
edges {
node {
publicLocation
width
height
}
}
}
}
}
}
}
}
}""" % clip_id}).encode(), headers={
'Content-Type': 'application/json',
})['data']['viewer']['clip']
title = clip['title']
formats = []
for edge in clip.get('videoFiles', {}).get('edges', []):
node = edge.get('node', {})
n_url = node.get('publicLocation')
if not n_url:
continue
ext = determine_ext(n_url)
if ext == 'm3u8':
formats.extend(self._extract_m3u8_formats(
n_url, clip_id, 'mp4', 'm3u8_native',
m3u8_id='hls', fatal=False))
else:
video_profile = node.get('videoProfile', {})
tbr = int_or_none(video_profile.get('bitrate'))
format_id = 'http'
if tbr:
format_id += '-%d' % tbr
formats.append({
'format_id': format_id,
'url': n_url,
'width': int_or_none(video_profile.get('width')),
'height': int_or_none(video_profile.get('height')),
'tbr': tbr,
'filesize': int_or_none(node.get('fileSize')),
})
self._sort_formats(formats)
subtitles = {}
for edge in clip.get('captionFiles', {}).get('edges', []):
node = edge.get('node', {})
n_url = node.get('publicLocation')
if not n_url:
continue
subtitles.setdefault('de', []).append({
'url': n_url,
})
thumbnails = []
for edge in clip.get('teaserImages', {}).get('edges', []):
for image_edge in edge.get('node', {}).get('imageFiles', {}).get('edges', []):
node = image_edge.get('node', {})
n_url = node.get('publicLocation')
if not n_url:
continue
thumbnails.append({
'url': n_url,
'width': int_or_none(node.get('width')),
'height': int_or_none(node.get('height')),
})
return {
'id': clip_id,
'title': title,
'description': clip.get('description'),
'duration': int_or_none(clip.get('duration')),
'timestamp': parse_iso8601(clip.get('createdAt')),
'age_limit': int_or_none(clip.get('ageRestriction')),
'formats': formats,
'subtitles': subtitles,
'thumbnails': thumbnails,
}

View File

@@ -0,0 +1,84 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .adobepass import AdobePassIE
from ..utils import (
smuggle_url,
update_url_query,
int_or_none,
)
class BravoTVIE(AdobePassIE):
_VALID_URL = r'https?://(?:www\.)?bravotv\.com/(?:[^/]+/)+(?P<id>[^/?#]+)'
_TESTS = [{
'url': 'https://www.bravotv.com/top-chef/season-16/episode-15/videos/the-top-chef-season-16-winner-is',
'md5': 'e34684cfea2a96cd2ee1ef3a60909de9',
'info_dict': {
'id': 'epL0pmK1kQlT',
'ext': 'mp4',
'title': 'The Top Chef Season 16 Winner Is...',
'description': 'Find out who takes the title of Top Chef!',
'uploader': 'NBCU-BRAV',
'upload_date': '20190314',
'timestamp': 1552591860,
}
}, {
'url': 'http://www.bravotv.com/below-deck/season-3/ep-14-reunion-part-1',
'only_matching': True,
}]
def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
settings = self._parse_json(self._search_regex(
r'<script[^>]+data-drupal-selector="drupal-settings-json"[^>]*>({.+?})</script>', webpage, 'drupal settings'),
display_id)
info = {}
query = {
'mbr': 'true',
}
account_pid, release_pid = [None] * 2
tve = settings.get('ls_tve')
if tve:
query['manifest'] = 'm3u'
mobj = re.search(r'<[^>]+id="pdk-player"[^>]+data-url=["\']?(?:https?:)?//player\.theplatform\.com/p/([^/]+)/(?:[^/]+/)*select/([^?#&"\']+)', webpage)
if mobj:
account_pid, tp_path = mobj.groups()
release_pid = tp_path.strip('/').split('/')[-1]
else:
account_pid = 'HNK2IC'
tp_path = release_pid = tve['release_pid']
if tve.get('entitlement') == 'auth':
adobe_pass = settings.get('tve_adobe_auth', {})
resource = self._get_mvpd_resource(
adobe_pass.get('adobePassResourceId', 'bravo'),
tve['title'], release_pid, tve.get('rating'))
query['auth'] = self._extract_mvpd_auth(
url, release_pid, adobe_pass.get('adobePassRequestorId', 'bravo'), resource)
else:
shared_playlist = settings['ls_playlist']
account_pid = shared_playlist['account_pid']
metadata = shared_playlist['video_metadata'][shared_playlist['default_clip']]
tp_path = release_pid = metadata.get('release_pid')
if not release_pid:
release_pid = metadata['guid']
tp_path = 'media/guid/2140479951/' + release_pid
info.update({
'title': metadata['title'],
'description': metadata.get('description'),
'season_number': int_or_none(metadata.get('season_num')),
'episode_number': int_or_none(metadata.get('episode_num')),
})
query['switch'] = 'progressive'
info.update({
'_type': 'url_transparent',
'id': release_pid,
'url': smuggle_url(update_url_query(
'http://link.theplatform.com/s/%s/%s' % (account_pid, tp_path),
query), {'force_smil_url': True}),
'ie_key': 'ThePlatform',
})
return info

View File

@@ -0,0 +1,91 @@
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from .youtube import YoutubeIE
from ..utils import (
int_or_none,
url_or_none,
)
class BreakIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?break\.com/video/(?P<display_id>[^/]+?)(?:-(?P<id>\d+))?(?:[/?#&]|$)'
_TESTS = [{
'url': 'http://www.break.com/video/when-girls-act-like-guys-2468056',
'info_dict': {
'id': '2468056',
'ext': 'mp4',
'title': 'When Girls Act Like D-Bags',
'age_limit': 13,
},
}, {
# youtube embed
'url': 'http://www.break.com/video/someone-forgot-boat-brakes-work',
'info_dict': {
'id': 'RrrDLdeL2HQ',
'ext': 'mp4',
'title': 'Whale Watching Boat Crashing Into San Diego Dock',
'description': 'md5:afc1b2772f0a8468be51dd80eb021069',
'upload_date': '20160331',
'uploader': 'Steve Holden',
'uploader_id': 'sdholden07',
},
'params': {
'skip_download': True,
}
}, {
'url': 'http://www.break.com/video/ugc/baby-flex-2773063',
'only_matching': True,
}]
def _real_extract(self, url):
display_id, video_id = re.match(self._VALID_URL, url).groups()
webpage = self._download_webpage(url, display_id)
youtube_url = YoutubeIE._extract_url(webpage)
if youtube_url:
return self.url_result(youtube_url, ie=YoutubeIE.ie_key())
content = self._parse_json(
self._search_regex(
r'(?s)content["\']\s*:\s*(\[.+?\])\s*[,\n]', webpage,
'content'),
display_id)
formats = []
for video in content:
video_url = url_or_none(video.get('url'))
if not video_url:
continue
bitrate = int_or_none(self._search_regex(
r'(\d+)_kbps', video_url, 'tbr', default=None))
formats.append({
'url': video_url,
'format_id': 'http-%d' % bitrate if bitrate else 'http',
'tbr': bitrate,
})
self._sort_formats(formats)
title = self._search_regex(
(r'title["\']\s*:\s*(["\'])(?P<value>(?:(?!\1).)+)\1',
r'<h1[^>]*>(?P<value>[^<]+)'), webpage, 'title', group='value')
def get(key, name):
return int_or_none(self._search_regex(
r'%s["\']\s*:\s*["\'](\d+)' % key, webpage, name,
default=None))
age_limit = get('ratings', 'age limit')
video_id = video_id or get('pid', 'video id') or display_id
return {
'id': video_id,
'display_id': display_id,
'title': title,
'thumbnail': self._og_search_thumbnail(webpage),
'age_limit': age_limit,
'formats': formats,
}

View File

@@ -0,0 +1,677 @@
# coding: utf-8
from __future__ import unicode_literals
import base64
import re
import struct
from .adobepass import AdobePassIE
from .common import InfoExtractor
from ..compat import (
compat_etree_fromstring,
compat_HTTPError,
compat_parse_qs,
compat_urllib_parse_urlparse,
compat_urlparse,
compat_xml_parse_error,
)
from ..utils import (
clean_html,
extract_attributes,
ExtractorError,
find_xpath_attr,
fix_xml_ampersands,
float_or_none,
int_or_none,
js_to_json,
mimetype2ext,
parse_iso8601,
smuggle_url,
str_or_none,
unescapeHTML,
unsmuggle_url,
UnsupportedError,
update_url_query,
url_or_none,
)
class BrightcoveLegacyIE(InfoExtractor):
IE_NAME = 'brightcove:legacy'
_VALID_URL = r'(?:https?://.*brightcove\.com/(services|viewer).*?\?|brightcove:)(?P<query>.*)'
_TESTS = [
{
# From http://www.8tv.cat/8aldia/videos/xavier-sala-i-martin-aquesta-tarda-a-8-al-dia/
'url': 'http://c.brightcove.com/services/viewer/htmlFederated?playerID=1654948606001&flashID=myExperience&%40videoPlayer=2371591881001',
'md5': '5423e113865d26e40624dce2e4b45d95',
'note': 'Test Brightcove downloads and detection in GenericIE',
'info_dict': {
'id': '2371591881001',
'ext': 'mp4',
'title': 'Xavier Sala i Martín: “Un banc que no presta és un banc zombi que no serveix per a res”',
'uploader': '8TV',
'description': 'md5:a950cc4285c43e44d763d036710cd9cd',
'timestamp': 1368213670,
'upload_date': '20130510',
'uploader_id': '1589608506001',
},
'skip': 'The player has been deactivated by the content owner',
},
{
# From http://medianetwork.oracle.com/video/player/1785452137001
'url': 'http://c.brightcove.com/services/viewer/htmlFederated?playerID=1217746023001&flashID=myPlayer&%40videoPlayer=1785452137001',
'info_dict': {
'id': '1785452137001',
'ext': 'flv',
'title': 'JVMLS 2012: Arrays 2.0 - Opportunities and Challenges',
'description': 'John Rose speaks at the JVM Language Summit, August 1, 2012.',
'uploader': 'Oracle',
'timestamp': 1344975024,
'upload_date': '20120814',
'uploader_id': '1460825906',
},
'skip': 'video not playable',
},
{
# From http://mashable.com/2013/10/26/thermoelectric-bracelet-lets-you-control-your-body-temperature/
'url': 'http://c.brightcove.com/services/viewer/federated_f9?&playerID=1265504713001&publisherID=AQ%7E%7E%2CAAABBzUwv1E%7E%2CxP-xFHVUstiMFlNYfvF4G9yFnNaqCw_9&videoID=2750934548001',
'info_dict': {
'id': '2750934548001',
'ext': 'mp4',
'title': 'This Bracelet Acts as a Personal Thermostat',
'description': 'md5:547b78c64f4112766ccf4e151c20b6a0',
# 'uploader': 'Mashable',
'timestamp': 1382041798,
'upload_date': '20131017',
'uploader_id': '1130468786001',
},
},
{
# test that the default referer works
# from http://national.ballet.ca/interact/video/Lost_in_Motion_II/
'url': 'http://link.brightcove.com/services/player/bcpid756015033001?bckey=AQ~~,AAAApYJi_Ck~,GxhXCegT1Dp39ilhXuxMJxasUhVNZiil&bctid=2878862109001',
'info_dict': {
'id': '2878862109001',
'ext': 'mp4',
'title': 'Lost in Motion II',
'description': 'md5:363109c02998fee92ec02211bd8000df',
'uploader': 'National Ballet of Canada',
},
'skip': 'Video gone',
},
{
# test flv videos served by akamaihd.net
# From http://www.redbull.com/en/bike/stories/1331655643987/replay-uci-dh-world-cup-2014-from-fort-william
'url': 'http://c.brightcove.com/services/viewer/htmlFederated?%40videoPlayer=ref%3Aevent-stream-356&linkBaseURL=http%3A%2F%2Fwww.redbull.com%2Fen%2Fbike%2Fvideos%2F1331655630249%2Freplay-uci-fort-william-2014-dh&playerKey=AQ%7E%7E%2CAAAApYJ7UqE%7E%2Cxqr_zXk0I-zzNndy8NlHogrCb5QdyZRf&playerID=1398061561001#__youtubedl_smuggle=%7B%22Referer%22%3A+%22http%3A%2F%2Fwww.redbull.com%2Fen%2Fbike%2Fstories%2F1331655643987%2Freplay-uci-dh-world-cup-2014-from-fort-william%22%7D',
# The md5 checksum changes on each download
'info_dict': {
'id': '3750436379001',
'ext': 'flv',
'title': 'UCI MTB World Cup 2014: Fort William, UK - Downhill Finals',
'uploader': 'RBTV Old (do not use)',
'description': 'UCI MTB World Cup 2014: Fort William, UK - Downhill Finals',
'timestamp': 1409122195,
'upload_date': '20140827',
'uploader_id': '710858724001',
},
'skip': 'Video gone',
},
{
# playlist with 'videoList'
# from http://support.brightcove.com/en/video-cloud/docs/playlist-support-single-video-players
'url': 'http://c.brightcove.com/services/viewer/htmlFederated?playerID=3550052898001&playerKey=AQ%7E%7E%2CAAABmA9XpXk%7E%2C-Kp7jNgisre1fG5OdqpAFUTcs0lP_ZoL',
'info_dict': {
'title': 'Sealife',
'id': '3550319591001',
},
'playlist_mincount': 7,
'skip': 'Unsupported URL',
},
{
# playlist with 'playlistTab' (https://github.com/ytdl-org/youtube-dl/issues/9965)
'url': 'http://c.brightcove.com/services/json/experience/runtime/?command=get_programming_for_experience&playerKey=AQ%7E%7E,AAABXlLMdok%7E,NJ4EoMlZ4rZdx9eU1rkMVd8EaYPBBUlg',
'info_dict': {
'id': '1522758701001',
'title': 'Lesson 08',
},
'playlist_mincount': 10,
'skip': 'Unsupported URL',
},
{
# playerID inferred from bcpid
# from http://www.un.org/chinese/News/story.asp?NewsID=27724
'url': 'https://link.brightcove.com/services/player/bcpid1722935254001/?bctid=5360463607001&autoStart=false&secureConnections=true&width=650&height=350',
'only_matching': True, # Tested in GenericIE
}
]
@classmethod
def _build_brighcove_url(cls, object_str):
"""
Build a Brightcove url from a xml string containing
<object class="BrightcoveExperience">{params}</object>
"""
# Fix up some stupid HTML, see https://github.com/ytdl-org/youtube-dl/issues/1553
object_str = re.sub(r'(<param(?:\s+[a-zA-Z0-9_]+="[^"]*")*)>',
lambda m: m.group(1) + '/>', object_str)
# Fix up some stupid XML, see https://github.com/ytdl-org/youtube-dl/issues/1608
object_str = object_str.replace('<--', '<!--')
# remove namespace to simplify extraction
object_str = re.sub(r'(<object[^>]*)(xmlns=".*?")', r'\1', object_str)
object_str = fix_xml_ampersands(object_str)
try:
object_doc = compat_etree_fromstring(object_str.encode('utf-8'))
except compat_xml_parse_error:
return
fv_el = find_xpath_attr(object_doc, './param', 'name', 'flashVars')
if fv_el is not None:
flashvars = dict(
(k, v[0])
for k, v in compat_parse_qs(fv_el.attrib['value']).items())
else:
flashvars = {}
data_url = object_doc.attrib.get('data', '')
data_url_params = compat_parse_qs(compat_urllib_parse_urlparse(data_url).query)
def find_param(name):
if name in flashvars:
return flashvars[name]
node = find_xpath_attr(object_doc, './param', 'name', name)
if node is not None:
return node.attrib['value']
return data_url_params.get(name)
params = {}
playerID = find_param('playerID') or find_param('playerId')
if playerID is None:
raise ExtractorError('Cannot find player ID')
params['playerID'] = playerID
playerKey = find_param('playerKey')
# Not all pages define this value
if playerKey is not None:
params['playerKey'] = playerKey
# These fields hold the id of the video
videoPlayer = find_param('@videoPlayer') or find_param('videoId') or find_param('videoID') or find_param('@videoList')
if videoPlayer is not None:
if isinstance(videoPlayer, list):
videoPlayer = videoPlayer[0]
videoPlayer = videoPlayer.strip()
# UUID is also possible for videoPlayer (e.g.
# http://www.popcornflix.com/hoodies-vs-hooligans/7f2d2b87-bbf2-4623-acfb-ea942b4f01dd
# or http://www8.hp.com/cn/zh/home.html)
if not (re.match(
r'^(?:\d+|[\da-fA-F]{8}-?[\da-fA-F]{4}-?[\da-fA-F]{4}-?[\da-fA-F]{4}-?[\da-fA-F]{12})$',
videoPlayer) or videoPlayer.startswith('ref:')):
return None
params['@videoPlayer'] = videoPlayer
linkBase = find_param('linkBaseURL')
if linkBase is not None:
params['linkBaseURL'] = linkBase
return cls._make_brightcove_url(params)
@classmethod
def _build_brighcove_url_from_js(cls, object_js):
# The layout of JS is as follows:
# customBC.createVideo = function (width, height, playerID, playerKey, videoPlayer, VideoRandomID) {
# // build Brightcove <object /> XML
# }
m = re.search(
r'''(?x)customBC\.createVideo\(
.*? # skipping width and height
["\'](?P<playerID>\d+)["\']\s*,\s* # playerID
["\'](?P<playerKey>AQ[^"\']{48})[^"\']*["\']\s*,\s* # playerKey begins with AQ and is 50 characters
# in length, however it's appended to itself
# in places, so truncate
["\'](?P<videoID>\d+)["\'] # @videoPlayer
''', object_js)
if m:
return cls._make_brightcove_url(m.groupdict())
@classmethod
def _make_brightcove_url(cls, params):
return update_url_query(
'http://c.brightcove.com/services/viewer/htmlFederated', params)
@classmethod
def _extract_brightcove_url(cls, webpage):
"""Try to extract the brightcove url from the webpage, returns None
if it can't be found
"""
urls = cls._extract_brightcove_urls(webpage)
return urls[0] if urls else None
@classmethod
def _extract_brightcove_urls(cls, webpage):
"""Return a list of all Brightcove URLs from the webpage """
url_m = re.search(
r'''(?x)
<meta\s+
(?:property|itemprop)=([\'"])(?:og:video|embedURL)\1[^>]+
content=([\'"])(?P<url>https?://(?:secure|c)\.brightcove.com/(?:(?!\2).)+)\2
''', webpage)
if url_m:
url = unescapeHTML(url_m.group('url'))
# Some sites don't add it, we can't download with this url, for example:
# http://www.ktvu.com/videos/news/raw-video-caltrain-releases-video-of-man-almost/vCTZdY/
if 'playerKey' in url or 'videoId' in url or 'idVideo' in url:
return [url]
matches = re.findall(
r'''(?sx)<object
(?:
[^>]+?class=[\'"][^>]*?BrightcoveExperience.*?[\'"] |
[^>]*?>\s*<param\s+name="movie"\s+value="https?://[^/]*brightcove\.com/
).+?>\s*</object>''',
webpage)
if matches:
return list(filter(None, [cls._build_brighcove_url(m) for m in matches]))
matches = re.findall(r'(customBC\.createVideo\(.+?\);)', webpage)
if matches:
return list(filter(None, [
cls._build_brighcove_url_from_js(custom_bc)
for custom_bc in matches]))
return [src for _, src in re.findall(
r'<iframe[^>]+src=([\'"])((?:https?:)?//link\.brightcove\.com/services/player/(?!\1).+)\1', webpage)]
def _real_extract(self, url):
url, smuggled_data = unsmuggle_url(url, {})
# Change the 'videoId' and others field to '@videoPlayer'
url = re.sub(r'(?<=[?&])(videoI(d|D)|idVideo|bctid)', '%40videoPlayer', url)
# Change bckey (used by bcove.me urls) to playerKey
url = re.sub(r'(?<=[?&])bckey', 'playerKey', url)
mobj = re.match(self._VALID_URL, url)
query_str = mobj.group('query')
query = compat_urlparse.parse_qs(query_str)
videoPlayer = query.get('@videoPlayer')
if videoPlayer:
# We set the original url as the default 'Referer' header
referer = query.get('linkBaseURL', [None])[0] or smuggled_data.get('Referer', url)
video_id = videoPlayer[0]
if 'playerID' not in query:
mobj = re.search(r'/bcpid(\d+)', url)
if mobj is not None:
query['playerID'] = [mobj.group(1)]
publisher_id = query.get('publisherId')
if publisher_id and publisher_id[0].isdigit():
publisher_id = publisher_id[0]
if not publisher_id:
player_key = query.get('playerKey')
if player_key and ',' in player_key[0]:
player_key = player_key[0]
else:
player_id = query.get('playerID')
if player_id and player_id[0].isdigit():
headers = {}
if referer:
headers['Referer'] = referer
player_page = self._download_webpage(
'http://link.brightcove.com/services/player/bcpid' + player_id[0],
video_id, headers=headers, fatal=False)
if player_page:
player_key = self._search_regex(
r'<param\s+name="playerKey"\s+value="([\w~,-]+)"',
player_page, 'player key', fatal=False)
if player_key:
enc_pub_id = player_key.split(',')[1].replace('~', '=')
publisher_id = struct.unpack('>Q', base64.urlsafe_b64decode(enc_pub_id))[0]
if publisher_id:
brightcove_new_url = 'http://players.brightcove.net/%s/default_default/index.html?videoId=%s' % (publisher_id, video_id)
if referer:
brightcove_new_url = smuggle_url(brightcove_new_url, {'referrer': referer})
return self.url_result(brightcove_new_url, BrightcoveNewIE.ie_key(), video_id)
# TODO: figure out if it's possible to extract playlistId from playerKey
# elif 'playerKey' in query:
# player_key = query['playerKey']
# return self._get_playlist_info(player_key[0])
raise UnsupportedError(url)
class BrightcoveNewIE(AdobePassIE):
IE_NAME = 'brightcove:new'
_VALID_URL = r'https?://players\.brightcove\.net/(?P<account_id>\d+)/(?P<player_id>[^/]+)_(?P<embed>[^/]+)/index\.html\?.*(?P<content_type>video|playlist)Id=(?P<video_id>\d+|ref:[^&]+)'
_TESTS = [{
'url': 'http://players.brightcove.net/929656772001/e41d32dc-ec74-459e-a845-6c69f7b724ea_default/index.html?videoId=4463358922001',
'md5': 'c8100925723840d4b0d243f7025703be',
'info_dict': {
'id': '4463358922001',
'ext': 'mp4',
'title': 'Meet the man behind Popcorn Time',
'description': 'md5:eac376a4fe366edc70279bfb681aea16',
'duration': 165.768,
'timestamp': 1441391203,
'upload_date': '20150904',
'uploader_id': '929656772001',
'formats': 'mincount:20',
},
}, {
# with rtmp streams
'url': 'http://players.brightcove.net/4036320279001/5d112ed9-283f-485f-a7f9-33f42e8bc042_default/index.html?videoId=4279049078001',
'info_dict': {
'id': '4279049078001',
'ext': 'mp4',
'title': 'Titansgrave: Chapter 0',
'description': 'Titansgrave: Chapter 0',
'duration': 1242.058,
'timestamp': 1433556729,
'upload_date': '20150606',
'uploader_id': '4036320279001',
'formats': 'mincount:39',
},
'params': {
# m3u8 download
'skip_download': True,
}
}, {
# playlist stream
'url': 'https://players.brightcove.net/1752604059001/S13cJdUBz_default/index.html?playlistId=5718313430001',
'info_dict': {
'id': '5718313430001',
'title': 'No Audio Playlist',
},
'playlist_count': 7,
'params': {
# m3u8 download
'skip_download': True,
}
}, {
'url': 'http://players.brightcove.net/5690807595001/HyZNerRl7_default/index.html?playlistId=5743160747001',
'only_matching': True,
}, {
# ref: prefixed video id
'url': 'http://players.brightcove.net/3910869709001/21519b5c-4b3b-4363-accb-bdc8f358f823_default/index.html?videoId=ref:7069442',
'only_matching': True,
}, {
# non numeric ref: prefixed video id
'url': 'http://players.brightcove.net/710858724001/default_default/index.html?videoId=ref:event-stream-356',
'only_matching': True,
}, {
# unavailable video without message but with error_code
'url': 'http://players.brightcove.net/1305187701/c832abfb-641b-44eb-9da0-2fe76786505f_default/index.html?videoId=4377407326001',
'only_matching': True,
}]
@staticmethod
def _extract_url(ie, webpage):
urls = BrightcoveNewIE._extract_urls(ie, webpage)
return urls[0] if urls else None
@staticmethod
def _extract_urls(ie, webpage):
# Reference:
# 1. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#setvideoiniframe
# 2. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#tag
# 3. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#setvideousingjavascript
# 4. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/in-page-embed-player-implementation.html
# 5. https://support.brightcove.com/en/video-cloud/docs/dynamically-assigning-videos-player
entries = []
# Look for iframe embeds [1]
for _, url in re.findall(
r'<iframe[^>]+src=(["\'])((?:https?:)?//players\.brightcove\.net/\d+/[^/]+/index\.html.+?)\1', webpage):
entries.append(url if url.startswith('http') else 'http:' + url)
# Look for <video> tags [2] and embed_in_page embeds [3]
# [2] looks like:
for video, script_tag, account_id, player_id, embed in re.findall(
r'''(?isx)
(<video(?:-js)?\s+[^>]*\bdata-video-id\s*=\s*['"]?[^>]+>)
(?:.*?
(<script[^>]+
src=["\'](?:https?:)?//players\.brightcove\.net/
(\d+)/([^/]+)_([^/]+)/index(?:\.min)?\.js
)
)?
''', webpage):
attrs = extract_attributes(video)
# According to examples from [4] it's unclear whether video id
# may be optional and what to do when it is
video_id = attrs.get('data-video-id')
if not video_id:
continue
account_id = account_id or attrs.get('data-account')
if not account_id:
continue
player_id = player_id or attrs.get('data-player') or 'default'
embed = embed or attrs.get('data-embed') or 'default'
bc_url = 'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s' % (
account_id, player_id, embed, video_id)
# Some brightcove videos may be embedded with video tag only and
# without script tag or any mentioning of brightcove at all. Such
# embeds are considered ambiguous since they are matched based only
# on data-video-id and data-account attributes and in the wild may
# not be brightcove embeds at all. Let's check reconstructed
# brightcove URLs in case of such embeds and only process valid
# ones. By this we ensure there is indeed a brightcove embed.
if not script_tag and not ie._is_valid_url(
bc_url, video_id, 'possible brightcove video'):
continue
entries.append(bc_url)
return entries
def _parse_brightcove_metadata(self, json_data, video_id, headers={}):
title = json_data['name'].strip()
formats = []
for source in json_data.get('sources', []):
container = source.get('container')
ext = mimetype2ext(source.get('type'))
src = source.get('src')
# https://support.brightcove.com/playback-api-video-fields-reference#key_systems_object
if ext == 'ism' or container == 'WVM' or source.get('key_systems'):
continue
elif ext == 'm3u8' or container == 'M2TS':
if not src:
continue
formats.extend(self._extract_m3u8_formats(
src, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
elif ext == 'mpd':
if not src:
continue
formats.extend(self._extract_mpd_formats(src, video_id, 'dash', fatal=False))
else:
streaming_src = source.get('streaming_src')
stream_name, app_name = source.get('stream_name'), source.get('app_name')
if not src and not streaming_src and (not stream_name or not app_name):
continue
tbr = float_or_none(source.get('avg_bitrate'), 1000)
height = int_or_none(source.get('height'))
width = int_or_none(source.get('width'))
f = {
'tbr': tbr,
'filesize': int_or_none(source.get('size')),
'container': container,
'ext': ext or container.lower(),
}
if width == 0 and height == 0:
f.update({
'vcodec': 'none',
})
else:
f.update({
'width': width,
'height': height,
'vcodec': source.get('codec'),
})
def build_format_id(kind):
format_id = kind
if tbr:
format_id += '-%dk' % int(tbr)
if height:
format_id += '-%dp' % height
return format_id
if src or streaming_src:
f.update({
'url': src or streaming_src,
'format_id': build_format_id('http' if src else 'http-streaming'),
'source_preference': 0 if src else -1,
})
else:
f.update({
'url': app_name,
'play_path': stream_name,
'format_id': build_format_id('rtmp'),
})
formats.append(f)
if not formats:
# for sonyliv.com DRM protected videos
s3_source_url = json_data.get('custom_fields', {}).get('s3sourceurl')
if s3_source_url:
formats.append({
'url': s3_source_url,
'format_id': 'source',
})
errors = json_data.get('errors')
if not formats and errors:
error = errors[0]
raise ExtractorError(
error.get('message') or error.get('error_subcode') or error['error_code'], expected=True)
self._sort_formats(formats)
for f in formats:
f.setdefault('http_headers', {}).update(headers)
subtitles = {}
for text_track in json_data.get('text_tracks', []):
if text_track.get('kind') != 'captions':
continue
text_track_url = url_or_none(text_track.get('src'))
if not text_track_url:
continue
lang = (str_or_none(text_track.get('srclang'))
or str_or_none(text_track.get('label')) or 'en').lower()
subtitles.setdefault(lang, []).append({
'url': text_track_url,
})
is_live = False
duration = float_or_none(json_data.get('duration'), 1000)
if duration is not None and duration <= 0:
is_live = True
return {
'id': video_id,
'title': self._live_title(title) if is_live else title,
'description': clean_html(json_data.get('description')),
'thumbnail': json_data.get('thumbnail') or json_data.get('poster'),
'duration': duration,
'timestamp': parse_iso8601(json_data.get('published_at')),
'uploader_id': json_data.get('account_id'),
'formats': formats,
'subtitles': subtitles,
'tags': json_data.get('tags', []),
'is_live': is_live,
}
def _real_extract(self, url):
url, smuggled_data = unsmuggle_url(url, {})
self._initialize_geo_bypass({
'countries': smuggled_data.get('geo_countries'),
'ip_blocks': smuggled_data.get('geo_ip_blocks'),
})
account_id, player_id, embed, content_type, video_id = re.match(self._VALID_URL, url).groups()
policy_key_id = '%s_%s' % (account_id, player_id)
policy_key = self._downloader.cache.load('brightcove', policy_key_id)
policy_key_extracted = False
store_pk = lambda x: self._downloader.cache.store('brightcove', policy_key_id, x)
def extract_policy_key():
webpage = self._download_webpage(
'http://players.brightcove.net/%s/%s_%s/index.min.js'
% (account_id, player_id, embed), video_id)
policy_key = None
catalog = self._search_regex(
r'catalog\(({.+?})\);', webpage, 'catalog', default=None)
if catalog:
catalog = self._parse_json(
js_to_json(catalog), video_id, fatal=False)
if catalog:
policy_key = catalog.get('policyKey')
if not policy_key:
policy_key = self._search_regex(
r'policyKey\s*:\s*(["\'])(?P<pk>.+?)\1',
webpage, 'policy key', group='pk')
store_pk(policy_key)
return policy_key
api_url = 'https://edge.api.brightcove.com/playback/v1/accounts/%s/%ss/%s' % (account_id, content_type, video_id)
headers = {}
referrer = smuggled_data.get('referrer')
if referrer:
headers.update({
'Referer': referrer,
'Origin': re.search(r'https?://[^/]+', referrer).group(0),
})
for _ in range(2):
if not policy_key:
policy_key = extract_policy_key()
policy_key_extracted = True
headers['Accept'] = 'application/json;pk=%s' % policy_key
try:
json_data = self._download_json(api_url, video_id, headers=headers)
break
except ExtractorError as e:
if isinstance(e.cause, compat_HTTPError) and e.cause.code in (401, 403):
json_data = self._parse_json(e.cause.read().decode(), video_id)[0]
message = json_data.get('message') or json_data['error_code']
if json_data.get('error_subcode') == 'CLIENT_GEO':
self.raise_geo_restricted(msg=message)
elif json_data.get('error_code') == 'INVALID_POLICY_KEY' and not policy_key_extracted:
policy_key = None
store_pk(None)
continue
raise ExtractorError(message, expected=True)
raise
errors = json_data.get('errors')
if errors and errors[0].get('error_subcode') == 'TVE_AUTH':
custom_fields = json_data['custom_fields']
tve_token = self._extract_mvpd_auth(
smuggled_data['source_url'], video_id,
custom_fields['bcadobepassrequestorid'],
custom_fields['bcadobepassresourceid'])
json_data = self._download_json(
api_url, video_id, headers={
'Accept': 'application/json;pk=%s' % policy_key
}, query={
'tveToken': tve_token,
})
if content_type == 'playlist':
return self.playlist_result(
[self._parse_brightcove_metadata(vid, vid.get('id'), headers)
for vid in json_data.get('videos', []) if vid.get('id')],
json_data.get('id'), json_data.get('name'),
json_data.get('description'))
return self._parse_brightcove_metadata(
json_data, video_id, headers=headers)

View File

@@ -0,0 +1,48 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from .jwplatform import JWPlatformIE
class BusinessInsiderIE(InfoExtractor):
_VALID_URL = r'https?://(?:[^/]+\.)?businessinsider\.(?:com|nl)/(?:[^/]+/)*(?P<id>[^/?#&]+)'
_TESTS = [{
'url': 'http://uk.businessinsider.com/how-much-radiation-youre-exposed-to-in-everyday-life-2016-6',
'md5': 'ffed3e1e12a6f950aa2f7d83851b497a',
'info_dict': {
'id': 'cjGDb0X9',
'ext': 'mp4',
'title': "Bananas give you more radiation exposure than living next to a nuclear power plant",
'description': 'md5:0175a3baf200dd8fa658f94cade841b3',
'upload_date': '20160611',
'timestamp': 1465675620,
},
}, {
'url': 'https://www.businessinsider.nl/5-scientifically-proven-things-make-you-less-attractive-2017-7/',
'md5': '43f438dbc6da0b89f5ac42f68529d84a',
'info_dict': {
'id': '5zJwd4FK',
'ext': 'mp4',
'title': 'Deze dingen zorgen ervoor dat je minder snel een date scoort',
'description': 'md5:2af8975825d38a4fed24717bbe51db49',
'upload_date': '20170705',
'timestamp': 1499270528,
},
}, {
'url': 'http://www.businessinsider.com/excel-index-match-vlookup-video-how-to-2015-2?IR=T',
'only_matching': True,
}]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
jwplatform_id = self._search_regex(
(r'data-media-id=["\']([a-zA-Z0-9]{8})',
r'id=["\']jwplayer_([a-zA-Z0-9]{8})',
r'id["\']?\s*:\s*["\']?([a-zA-Z0-9]{8})',
r'(?:jwplatform\.com/players/|jwplayer_)([a-zA-Z0-9]{8})'),
webpage, 'jwplatform id')
return self.url_result(
'jwplatform:%s' % jwplatform_id, ie=JWPlatformIE.ie_key(),
video_id=video_id)

View File

@@ -0,0 +1,98 @@
# coding: utf-8
from __future__ import unicode_literals
import json
import re
from .common import InfoExtractor
from .facebook import FacebookIE
class BuzzFeedIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?buzzfeed\.com/[^?#]*?/(?P<id>[^?#]+)'
_TESTS = [{
'url': 'http://www.buzzfeed.com/abagg/this-angry-ram-destroys-a-punching-bag-like-a-boss?utm_term=4ldqpia',
'info_dict': {
'id': 'this-angry-ram-destroys-a-punching-bag-like-a-boss',
'title': 'This Angry Ram Destroys A Punching Bag Like A Boss',
'description': 'Rambro!',
},
'playlist': [{
'info_dict': {
'id': 'aVCR29aE_OQ',
'ext': 'mp4',
'title': 'Angry Ram destroys a punching bag..',
'description': 'md5:c59533190ef23fd4458a5e8c8c872345',
'upload_date': '20141024',
'uploader_id': 'Buddhanz1',
'uploader': 'Angry Ram',
}
}]
}, {
'url': 'http://www.buzzfeed.com/sheridanwatson/look-at-this-cute-dog-omg?utm_term=4ldqpia',
'params': {
'skip_download': True, # Got enough YouTube download tests
},
'info_dict': {
'id': 'look-at-this-cute-dog-omg',
'description': 're:Munchkin the Teddy Bear is back ?!',
'title': 'You Need To Stop What You\'re Doing And Watching This Dog Walk On A Treadmill',
},
'playlist': [{
'info_dict': {
'id': 'mVmBL8B-In0',
'ext': 'mp4',
'title': 're:Munchkin the Teddy Bear gets her exercise',
'description': 'md5:28faab95cda6e361bcff06ec12fc21d8',
'upload_date': '20141124',
'uploader_id': 'CindysMunchkin',
'uploader': 're:^Munchkin the',
},
}]
}, {
'url': 'http://www.buzzfeed.com/craigsilverman/the-most-adorable-crash-landing-ever#.eq7pX0BAmK',
'info_dict': {
'id': 'the-most-adorable-crash-landing-ever',
'title': 'Watch This Baby Goose Make The Most Adorable Crash Landing',
'description': 'This gosling knows how to stick a landing.',
},
'playlist': [{
'md5': '763ca415512f91ca62e4621086900a23',
'info_dict': {
'id': '971793786185728',
'ext': 'mp4',
'title': 'We set up crash pads so that the goslings on our roof would have a safe landi...',
'uploader': 'Calgary Outdoor Centre-University of Calgary',
},
}],
'add_ie': ['Facebook'],
}]
def _real_extract(self, url):
playlist_id = self._match_id(url)
webpage = self._download_webpage(url, playlist_id)
all_buckets = re.findall(
r'(?s)<div class="video-embed[^"]*"..*?rel:bf_bucket_data=\'([^\']+)\'',
webpage)
entries = []
for bd_json in all_buckets:
bd = json.loads(bd_json)
video = bd.get('video') or bd.get('progload_video')
if not video:
continue
entries.append(self.url_result(video['url']))
facebook_urls = FacebookIE._extract_urls(webpage)
entries.extend([
self.url_result(facebook_url)
for facebook_url in facebook_urls])
return {
'_type': 'playlist',
'id': playlist_id,
'title': self._og_search_title(webpage),
'description': self._og_search_description(webpage),
'entries': entries,
}

View File

@@ -0,0 +1,117 @@
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
determine_ext,
merge_dicts,
parse_duration,
url_or_none,
)
class BYUtvIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?byutv\.org/(?:watch|player)/(?!event/)(?P<id>[0-9a-f-]+)(?:/(?P<display_id>[^/?#&]+))?'
_TESTS = [{
# ooyalaVOD
'url': 'http://www.byutv.org/watch/6587b9a3-89d2-42a6-a7f7-fd2f81840a7d/studio-c-season-5-episode-5',
'info_dict': {
'id': 'ZvanRocTpW-G5_yZFeltTAMv6jxOU9KH',
'display_id': 'studio-c-season-5-episode-5',
'ext': 'mp4',
'title': 'Season 5 Episode 5',
'description': 'md5:1d31dc18ef4f075b28f6a65937d22c65',
'thumbnail': r're:^https?://.*',
'duration': 1486.486,
},
'params': {
'skip_download': True,
},
'add_ie': ['Ooyala'],
}, {
# dvr
'url': 'https://www.byutv.org/player/8f1dab9b-b243-47c8-b525-3e2d021a3451/byu-softball-pacific-vs-byu-41219---game-2',
'info_dict': {
'id': '8f1dab9b-b243-47c8-b525-3e2d021a3451',
'display_id': 'byu-softball-pacific-vs-byu-41219---game-2',
'ext': 'mp4',
'title': 'Pacific vs. BYU (4/12/19)',
'description': 'md5:1ac7b57cb9a78015910a4834790ce1f3',
'duration': 11645,
},
'params': {
'skip_download': True
},
}, {
'url': 'http://www.byutv.org/watch/6587b9a3-89d2-42a6-a7f7-fd2f81840a7d',
'only_matching': True,
}, {
'url': 'https://www.byutv.org/player/27741493-dc83-40b0-8420-e7ae38a2ae98/byu-football-toledo-vs-byu-93016?listid=4fe0fee5-0d3c-4a29-b725-e4948627f472&listindex=0&q=toledo',
'only_matching': True,
}]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
display_id = mobj.group('display_id') or video_id
video = self._download_json(
'https://api.byutv.org/api3/catalog/getvideosforcontent',
display_id, query={
'contentid': video_id,
'channel': 'byutv',
'x-byutv-context': 'web$US',
}, headers={
'x-byutv-context': 'web$US',
'x-byutv-platformkey': 'xsaaw9c7y5',
})
ep = video.get('ooyalaVOD')
if ep:
return {
'_type': 'url_transparent',
'ie_key': 'Ooyala',
'url': 'ooyala:%s' % ep['providerId'],
'id': video_id,
'display_id': display_id,
'title': ep.get('title'),
'description': ep.get('description'),
'thumbnail': ep.get('imageThumbnail'),
}
info = {}
formats = []
for format_id, ep in video.items():
if not isinstance(ep, dict):
continue
video_url = url_or_none(ep.get('videoUrl'))
if not video_url:
continue
ext = determine_ext(video_url)
if ext == 'm3u8':
formats.extend(self._extract_m3u8_formats(
video_url, video_id, 'mp4', entry_protocol='m3u8_native',
m3u8_id='hls', fatal=False))
elif ext == 'mpd':
formats.extend(self._extract_mpd_formats(
video_url, video_id, mpd_id='dash', fatal=False))
else:
formats.append({
'url': video_url,
'format_id': format_id,
})
merge_dicts(info, {
'title': ep.get('title'),
'description': ep.get('description'),
'thumbnail': ep.get('imageThumbnail'),
'duration': parse_duration(ep.get('length')),
})
self._sort_formats(formats)
return merge_dicts(info, {
'id': video_id,
'display_id': display_id,
'title': display_id,
'formats': formats,
})

View File

@@ -0,0 +1,65 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import js_to_json
class C56IE(InfoExtractor):
_VALID_URL = r'https?://(?:(?:www|player)\.)?56\.com/(?:.+?/)?(?:v_|(?:play_album.+-))(?P<textid>.+?)\.(?:html|swf)'
IE_NAME = '56.com'
_TESTS = [{
'url': 'http://www.56.com/u39/v_OTM0NDA3MTY.html',
'md5': 'e59995ac63d0457783ea05f93f12a866',
'info_dict': {
'id': '93440716',
'ext': 'flv',
'title': '网事知多少 第32期车怒',
'duration': 283.813,
},
}, {
'url': 'http://www.56.com/u47/v_MTM5NjQ5ODc2.html',
'md5': '',
'info_dict': {
'id': '82247482',
'title': '爱的诅咒之杜鹃花开',
},
'playlist_count': 7,
'add_ie': ['Sohu'],
}]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url, flags=re.VERBOSE)
text_id = mobj.group('textid')
webpage = self._download_webpage(url, text_id)
sohu_video_info_str = self._search_regex(
r'var\s+sohuVideoInfo\s*=\s*({[^}]+});', webpage, 'Sohu video info', default=None)
if sohu_video_info_str:
sohu_video_info = self._parse_json(
sohu_video_info_str, text_id, transform_source=js_to_json)
return self.url_result(sohu_video_info['url'], 'Sohu')
page = self._download_json(
'http://vxml.56.com/json/%s/' % text_id, text_id, 'Downloading video info')
info = page['info']
formats = [
{
'format_id': f['type'],
'filesize': int(f['filesize']),
'url': f['url']
} for f in info['rfiles']
]
self._sort_formats(formats)
return {
'id': info['vid'],
'title': info['Subject'],
'duration': int(info['duration']) / 1000.0,
'formats': formats,
'thumbnail': info.get('bimg') or info.get('img'),
}

View File

@@ -0,0 +1,161 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..compat import (
compat_urllib_parse_urlencode,
compat_urlparse,
)
from ..utils import (
clean_html,
parse_duration,
str_to_int,
unified_strdate,
)
class CamdemyIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?camdemy\.com/media/(?P<id>\d+)'
_TESTS = [{
# single file
'url': 'http://www.camdemy.com/media/5181/',
'md5': '5a5562b6a98b37873119102e052e311b',
'info_dict': {
'id': '5181',
'ext': 'mp4',
'title': 'Ch1-1 Introduction, Signals (02-23-2012)',
'thumbnail': r're:^https?://.*\.jpg$',
'creator': 'ss11spring',
'duration': 1591,
'upload_date': '20130114',
'view_count': int,
}
}, {
# With non-empty description
# webpage returns "No permission or not login"
'url': 'http://www.camdemy.com/media/13885',
'md5': '4576a3bb2581f86c61044822adbd1249',
'info_dict': {
'id': '13885',
'ext': 'mp4',
'title': 'EverCam + Camdemy QuickStart',
'thumbnail': r're:^https?://.*\.jpg$',
'description': 'md5:2a9f989c2b153a2342acee579c6e7db6',
'creator': 'evercam',
'duration': 318,
}
}, {
# External source (YouTube)
'url': 'http://www.camdemy.com/media/14842',
'info_dict': {
'id': '2vsYQzNIsJo',
'ext': 'mp4',
'title': 'Excel 2013 Tutorial - How to add Password Protection',
'description': 'Excel 2013 Tutorial for Beginners - How to add Password Protection',
'upload_date': '20130211',
'uploader': 'Hun Kim',
'uploader_id': 'hunkimtutorials',
},
'params': {
'skip_download': True,
},
}]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
src_from = self._html_search_regex(
r"class=['\"]srcFrom['\"][^>]*>Sources?(?:\s+from)?\s*:\s*<a[^>]+(?:href|title)=(['\"])(?P<url>(?:(?!\1).)+)\1",
webpage, 'external source', default=None, group='url')
if src_from:
return self.url_result(src_from)
oembed_obj = self._download_json(
'http://www.camdemy.com/oembed/?format=json&url=' + url, video_id)
title = oembed_obj['title']
thumb_url = oembed_obj['thumbnail_url']
video_folder = compat_urlparse.urljoin(thumb_url, 'video/')
file_list_doc = self._download_xml(
compat_urlparse.urljoin(video_folder, 'fileList.xml'),
video_id, 'Downloading filelist XML')
file_name = file_list_doc.find('./video/item/fileName').text
video_url = compat_urlparse.urljoin(video_folder, file_name)
# Some URLs return "No permission or not login" in a webpage despite being
# freely available via oembed JSON URL (e.g. http://www.camdemy.com/media/13885)
upload_date = unified_strdate(self._search_regex(
r'>published on ([^<]+)<', webpage,
'upload date', default=None))
view_count = str_to_int(self._search_regex(
r'role=["\']viewCnt["\'][^>]*>([\d,.]+) views',
webpage, 'view count', default=None))
description = self._html_search_meta(
'description', webpage, default=None) or clean_html(
oembed_obj.get('description'))
return {
'id': video_id,
'url': video_url,
'title': title,
'thumbnail': thumb_url,
'description': description,
'creator': oembed_obj.get('author_name'),
'duration': parse_duration(oembed_obj.get('duration')),
'upload_date': upload_date,
'view_count': view_count,
}
class CamdemyFolderIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?camdemy\.com/folder/(?P<id>\d+)'
_TESTS = [{
# links with trailing slash
'url': 'http://www.camdemy.com/folder/450',
'info_dict': {
'id': '450',
'title': '信號與系統 2012 & 2011 (Signals and Systems)',
},
'playlist_mincount': 145
}, {
# links without trailing slash
# and multi-page
'url': 'http://www.camdemy.com/folder/853',
'info_dict': {
'id': '853',
'title': '科學計算 - 使用 Matlab'
},
'playlist_mincount': 20
}, {
# with displayMode parameter. For testing the codes to add parameters
'url': 'http://www.camdemy.com/folder/853/?displayMode=defaultOrderByOrg',
'info_dict': {
'id': '853',
'title': '科學計算 - 使用 Matlab'
},
'playlist_mincount': 20
}]
def _real_extract(self, url):
folder_id = self._match_id(url)
# Add displayMode=list so that all links are displayed in a single page
parsed_url = list(compat_urlparse.urlparse(url))
query = dict(compat_urlparse.parse_qsl(parsed_url[4]))
query.update({'displayMode': 'list'})
parsed_url[4] = compat_urllib_parse_urlencode(query)
final_url = compat_urlparse.urlunparse(parsed_url)
page = self._download_webpage(final_url, folder_id)
matches = re.findall(r"href='(/media/\d+/?)'", page)
entries = [self.url_result('http://www.camdemy.com' + media_path)
for media_path in matches]
folder_title = self._html_search_meta('keywords', page)
return self.playlist_result(entries, folder_id, folder_title)

View File

@@ -0,0 +1,98 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
ExtractorError,
int_or_none,
url_or_none,
)
class CamModelsIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?cammodels\.com/cam/(?P<id>[^/?#&]+)'
_TESTS = [{
'url': 'https://www.cammodels.com/cam/AutumnKnight/',
'only_matching': True,
'age_limit': 18
}]
def _real_extract(self, url):
user_id = self._match_id(url)
webpage = self._download_webpage(
url, user_id, headers=self.geo_verification_headers())
manifest_root = self._html_search_regex(
r'manifestUrlRoot=([^&\']+)', webpage, 'manifest', default=None)
if not manifest_root:
ERRORS = (
("I'm offline, but let's stay connected", 'This user is currently offline'),
('in a private show', 'This user is in a private show'),
('is currently performing LIVE', 'This model is currently performing live'),
)
for pattern, message in ERRORS:
if pattern in webpage:
error = message
expected = True
break
else:
error = 'Unable to find manifest URL root'
expected = False
raise ExtractorError(error, expected=expected)
manifest = self._download_json(
'%s%s.json' % (manifest_root, user_id), user_id)
formats = []
for format_id, format_dict in manifest['formats'].items():
if not isinstance(format_dict, dict):
continue
encodings = format_dict.get('encodings')
if not isinstance(encodings, list):
continue
vcodec = format_dict.get('videoCodec')
acodec = format_dict.get('audioCodec')
for media in encodings:
if not isinstance(media, dict):
continue
media_url = url_or_none(media.get('location'))
if not media_url:
continue
format_id_list = [format_id]
height = int_or_none(media.get('videoHeight'))
if height is not None:
format_id_list.append('%dp' % height)
f = {
'url': media_url,
'format_id': '-'.join(format_id_list),
'width': int_or_none(media.get('videoWidth')),
'height': height,
'vbr': int_or_none(media.get('videoKbps')),
'abr': int_or_none(media.get('audioKbps')),
'fps': int_or_none(media.get('fps')),
'vcodec': vcodec,
'acodec': acodec,
}
if 'rtmp' in format_id:
f['ext'] = 'flv'
elif 'hls' in format_id:
f.update({
'ext': 'mp4',
# hls skips fragments, preferring rtmp
'preference': -1,
})
else:
continue
formats.append(f)
self._sort_formats(formats)
return {
'id': user_id,
'title': self._live_title(user_id),
'is_live': True,
'formats': formats,
'age_limit': 18
}

View File

@@ -0,0 +1,71 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
int_or_none,
unified_timestamp,
)
class CamTubeIE(InfoExtractor):
_VALID_URL = r'https?://(?:(?:www|api)\.)?camtube\.co/recordings?/(?P<id>[^/?#&]+)'
_TESTS = [{
'url': 'https://camtube.co/recording/minafay-030618-1136-chaturbate-female',
'info_dict': {
'id': '42ad3956-dd5b-445a-8313-803ea6079fac',
'display_id': 'minafay-030618-1136-chaturbate-female',
'ext': 'mp4',
'title': 'minafay-030618-1136-chaturbate-female',
'duration': 1274,
'timestamp': 1528018608,
'upload_date': '20180603',
'age_limit': 18
},
'params': {
'skip_download': True,
},
}]
_API_BASE = 'https://api.camtube.co'
def _real_extract(self, url):
display_id = self._match_id(url)
token = self._download_json(
'%s/rpc/session/new' % self._API_BASE, display_id,
'Downloading session token')['token']
self._set_cookie('api.camtube.co', 'session', token)
video = self._download_json(
'%s/recordings/%s' % (self._API_BASE, display_id), display_id,
headers={'Referer': url})
video_id = video['uuid']
timestamp = unified_timestamp(video.get('createdAt'))
duration = int_or_none(video.get('duration'))
view_count = int_or_none(video.get('viewCount'))
like_count = int_or_none(video.get('likeCount'))
creator = video.get('stageName')
formats = [{
'url': '%s/recordings/%s/manifest.m3u8'
% (self._API_BASE, video_id),
'format_id': 'hls',
'ext': 'mp4',
'protocol': 'm3u8_native',
}]
return {
'id': video_id,
'display_id': display_id,
'title': display_id,
'timestamp': timestamp,
'duration': duration,
'view_count': view_count,
'like_count': like_count,
'creator': creator,
'formats': formats,
'age_limit': 18
}

View File

@@ -0,0 +1,89 @@
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
int_or_none,
parse_duration,
unified_strdate,
)
class CamWithHerIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?camwithher\.tv/view_video\.php\?.*\bviewkey=(?P<id>\w+)'
_TESTS = [{
'url': 'http://camwithher.tv/view_video.php?viewkey=6e9a24e2c0e842e1f177&page=&viewtype=&category=',
'info_dict': {
'id': '5644',
'ext': 'flv',
'title': 'Periscope Tease',
'description': 'In the clouds teasing on periscope to my favorite song',
'duration': 240,
'view_count': int,
'comment_count': int,
'uploader': 'MileenaK',
'upload_date': '20160322',
'age_limit': 18,
},
'params': {
'skip_download': True,
}
}, {
'url': 'http://camwithher.tv/view_video.php?viewkey=6dfd8b7c97531a459937',
'only_matching': True,
}, {
'url': 'http://camwithher.tv/view_video.php?page=&viewkey=6e9a24e2c0e842e1f177&viewtype=&category=',
'only_matching': True,
}, {
'url': 'http://camwithher.tv/view_video.php?viewkey=b6c3b5bea9515d1a1fc4&page=&viewtype=&category=mv',
'only_matching': True,
}]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
flv_id = self._html_search_regex(
r'<a[^>]+href=["\']/download/\?v=(\d+)', webpage, 'video id')
# Video URL construction algorithm is reverse-engineered from cwhplayer.swf
rtmp_url = 'rtmp://camwithher.tv/clipshare/%s' % (
('mp4:%s.mp4' % flv_id) if int(flv_id) > 2010 else flv_id)
title = self._html_search_regex(
r'<div[^>]+style="float:left"[^>]*>\s*<h2>(.+?)</h2>', webpage, 'title')
description = self._html_search_regex(
r'>Description:</span>(.+?)</div>', webpage, 'description', default=None)
runtime = self._search_regex(
r'Runtime\s*:\s*(.+?) \|', webpage, 'duration', default=None)
if runtime:
runtime = re.sub(r'[\s-]', '', runtime)
duration = parse_duration(runtime)
view_count = int_or_none(self._search_regex(
r'Views\s*:\s*(\d+)', webpage, 'view count', default=None))
comment_count = int_or_none(self._search_regex(
r'Comments\s*:\s*(\d+)', webpage, 'comment count', default=None))
uploader = self._search_regex(
r'Added by\s*:\s*<a[^>]+>([^<]+)</a>', webpage, 'uploader', default=None)
upload_date = unified_strdate(self._search_regex(
r'Added on\s*:\s*([\d-]+)', webpage, 'upload date', default=None))
return {
'id': flv_id,
'url': rtmp_url,
'ext': 'flv',
'no_resume': True,
'title': title,
'description': description,
'duration': duration,
'view_count': view_count,
'comment_count': comment_count,
'uploader': uploader,
'upload_date': upload_date,
'age_limit': 18
}

View File

@@ -0,0 +1,73 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import parse_duration
class Canalc2IE(InfoExtractor):
IE_NAME = 'canalc2.tv'
_VALID_URL = r'https?://(?:(?:www\.)?canalc2\.tv/video/|archives-canalc2\.u-strasbg\.fr/video\.asp\?.*\bidVideo=)(?P<id>\d+)'
_TESTS = [{
'url': 'http://www.canalc2.tv/video/12163',
'md5': '060158428b650f896c542dfbb3d6487f',
'info_dict': {
'id': '12163',
'ext': 'mp4',
'title': 'Terrasses du Numérique',
'duration': 122,
},
}, {
'url': 'http://archives-canalc2.u-strasbg.fr/video.asp?idVideo=11427&voir=oui',
'only_matching': True,
}]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(
'http://www.canalc2.tv/video/%s' % video_id, video_id)
title = self._html_search_regex(
r'(?s)class="[^"]*col_description[^"]*">.*?<h3>(.+?)</h3>',
webpage, 'title')
formats = []
for _, video_url in re.findall(r'file\s*=\s*(["\'])(.+?)\1', webpage):
if video_url.startswith('rtmp://'):
rtmp = re.search(
r'^(?P<url>rtmp://[^/]+/(?P<app>.+/))(?P<play_path>mp4:.+)$', video_url)
formats.append({
'url': rtmp.group('url'),
'format_id': 'rtmp',
'ext': 'flv',
'app': rtmp.group('app'),
'play_path': rtmp.group('play_path'),
'page_url': url,
})
else:
formats.append({
'url': video_url,
'format_id': 'http',
})
if formats:
info = {
'formats': formats,
}
else:
info = self._parse_html5_media_entries(url, webpage, url)[0]
self._sort_formats(info['formats'])
info.update({
'id': video_id,
'title': title,
'duration': parse_duration(self._search_regex(
r'id=["\']video_duree["\'][^>]*>([^<]+)',
webpage, 'duration', fatal=False)),
})
return info

View File

@@ -0,0 +1,116 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
# ExtractorError,
# HEADRequest,
int_or_none,
qualities,
unified_strdate,
)
class CanalplusIE(InfoExtractor):
IE_DESC = 'mycanal.fr and piwiplus.fr'
_VALID_URL = r'https?://(?:www\.)?(?P<site>mycanal|piwiplus)\.fr/(?:[^/]+/)*(?P<display_id>[^?/]+)(?:\.html\?.*\bvid=|/p/)(?P<id>\d+)'
_VIDEO_INFO_TEMPLATE = 'http://service.canal-plus.com/video/rest/getVideosLiees/%s/%s?format=json'
_SITE_ID_MAP = {
'mycanal': 'cplus',
'piwiplus': 'teletoon',
}
# Only works for direct mp4 URLs
_GEO_COUNTRIES = ['FR']
_TESTS = [{
'url': 'https://www.mycanal.fr/d17-emissions/lolywood/p/1397061',
'info_dict': {
'id': '1397061',
'display_id': 'lolywood',
'ext': 'mp4',
'title': 'Euro 2016 : Je préfère te prévenir - Lolywood - Episode 34',
'description': 'md5:7d97039d455cb29cdba0d652a0efaa5e',
'upload_date': '20160602',
},
}, {
# geo restricted, bypassed
'url': 'http://www.piwiplus.fr/videos-piwi/pid1405-le-labyrinthe-boing-super-ranger.html?vid=1108190',
'info_dict': {
'id': '1108190',
'display_id': 'pid1405-le-labyrinthe-boing-super-ranger',
'ext': 'mp4',
'title': 'BOING SUPER RANGER - Ep : Le labyrinthe',
'description': 'md5:4cea7a37153be42c1ba2c1d3064376ff',
'upload_date': '20140724',
},
'expected_warnings': ['HTTP Error 403: Forbidden'],
}]
def _real_extract(self, url):
site, display_id, video_id = re.match(self._VALID_URL, url).groups()
site_id = self._SITE_ID_MAP[site]
info_url = self._VIDEO_INFO_TEMPLATE % (site_id, video_id)
video_data = self._download_json(info_url, video_id, 'Downloading video JSON')
if isinstance(video_data, list):
video_data = [video for video in video_data if video.get('ID') == video_id][0]
media = video_data['MEDIA']
infos = video_data['INFOS']
preference = qualities(['MOBILE', 'BAS_DEBIT', 'HAUT_DEBIT', 'HD'])
# _, fmt_url = next(iter(media['VIDEOS'].items()))
# if '/geo' in fmt_url.lower():
# response = self._request_webpage(
# HEADRequest(fmt_url), video_id,
# 'Checking if the video is georestricted')
# if '/blocage' in response.geturl():
# raise ExtractorError(
# 'The video is not available in your country',
# expected=True)
formats = []
for format_id, format_url in media['VIDEOS'].items():
if not format_url:
continue
if format_id == 'HLS':
formats.extend(self._extract_m3u8_formats(
format_url, video_id, 'mp4', 'm3u8_native', m3u8_id=format_id, fatal=False))
elif format_id == 'HDS':
formats.extend(self._extract_f4m_formats(
format_url + '?hdcore=2.11.3', video_id, f4m_id=format_id, fatal=False))
else:
formats.append({
# the secret extracted from ya function in http://player.canalplus.fr/common/js/canalPlayer.js
'url': format_url + '?secret=pqzerjlsmdkjfoiuerhsdlfknaes',
'format_id': format_id,
'preference': preference(format_id),
})
self._sort_formats(formats)
thumbnails = [{
'id': image_id,
'url': image_url,
} for image_id, image_url in media.get('images', {}).items()]
titrage = infos['TITRAGE']
return {
'id': video_id,
'display_id': display_id,
'title': '%s - %s' % (titrage['TITRE'],
titrage['SOUS_TITRE']),
'upload_date': unified_strdate(infos.get('PUBLICATION', {}).get('DATE')),
'thumbnails': thumbnails,
'description': infos.get('DESCRIPTION'),
'duration': int_or_none(infos.get('DURATION')),
'view_count': int_or_none(infos.get('NB_VUES')),
'like_count': int_or_none(infos.get('NB_LIKES')),
'comment_count': int_or_none(infos.get('NB_COMMENTS')),
'formats': formats,
}

View File

@@ -0,0 +1,368 @@
from __future__ import unicode_literals
import re
import json
from .common import InfoExtractor
from .gigya import GigyaBaseIE
from ..compat import compat_HTTPError
from ..utils import (
ExtractorError,
strip_or_none,
float_or_none,
int_or_none,
merge_dicts,
parse_iso8601,
str_or_none,
url_or_none,
)
class CanvasIE(InfoExtractor):
_VALID_URL = r'https?://mediazone\.vrt\.be/api/v1/(?P<site_id>canvas|een|ketnet|vrt(?:video|nieuws)|sporza)/assets/(?P<id>[^/?#&]+)'
_TESTS = [{
'url': 'https://mediazone.vrt.be/api/v1/ketnet/assets/md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475',
'md5': '68993eda72ef62386a15ea2cf3c93107',
'info_dict': {
'id': 'md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475',
'display_id': 'md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475',
'ext': 'mp4',
'title': 'Nachtwacht: De Greystook',
'description': 'Nachtwacht: De Greystook',
'thumbnail': r're:^https?://.*\.jpg$',
'duration': 1468.04,
},
'expected_warnings': ['is not a supported codec', 'Unknown MIME type'],
}, {
'url': 'https://mediazone.vrt.be/api/v1/canvas/assets/mz-ast-5e5f90b6-2d72-4c40-82c2-e134f884e93e',
'only_matching': True,
}]
_HLS_ENTRY_PROTOCOLS_MAP = {
'HLS': 'm3u8_native',
'HLS_AES': 'm3u8',
}
_REST_API_BASE = 'https://media-services-public.vrt.be/vualto-video-aggregator-web/rest/external/v1'
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
site_id, video_id = mobj.group('site_id'), mobj.group('id')
# Old API endpoint, serves more formats but may fail for some videos
data = self._download_json(
'https://mediazone.vrt.be/api/v1/%s/assets/%s'
% (site_id, video_id), video_id, 'Downloading asset JSON',
'Unable to download asset JSON', fatal=False)
# New API endpoint
if not data:
token = self._download_json(
'%s/tokens' % self._REST_API_BASE, video_id,
'Downloading token', data=b'',
headers={'Content-Type': 'application/json'})['vrtPlayerToken']
data = self._download_json(
'%s/videos/%s' % (self._REST_API_BASE, video_id),
video_id, 'Downloading video JSON', fatal=False, query={
'vrtPlayerToken': token,
'client': '%s@PROD' % site_id,
}, expected_status=400)
message = data.get('message')
if message and not data.get('title'):
if data.get('code') == 'AUTHENTICATION_REQUIRED':
self.raise_login_required(message)
raise ExtractorError(message, expected=True)
title = data['title']
description = data.get('description')
formats = []
for target in data['targetUrls']:
format_url, format_type = url_or_none(target.get('url')), str_or_none(target.get('type'))
if not format_url or not format_type:
continue
format_type = format_type.upper()
if format_type in self._HLS_ENTRY_PROTOCOLS_MAP:
formats.extend(self._extract_m3u8_formats(
format_url, video_id, 'mp4', self._HLS_ENTRY_PROTOCOLS_MAP[format_type],
m3u8_id=format_type, fatal=False))
elif format_type == 'HDS':
formats.extend(self._extract_f4m_formats(
format_url, video_id, f4m_id=format_type, fatal=False))
elif format_type == 'MPEG_DASH':
formats.extend(self._extract_mpd_formats(
format_url, video_id, mpd_id=format_type, fatal=False))
elif format_type == 'HSS':
formats.extend(self._extract_ism_formats(
format_url, video_id, ism_id='mss', fatal=False))
else:
formats.append({
'format_id': format_type,
'url': format_url,
})
self._sort_formats(formats)
subtitles = {}
subtitle_urls = data.get('subtitleUrls')
if isinstance(subtitle_urls, list):
for subtitle in subtitle_urls:
subtitle_url = subtitle.get('url')
if subtitle_url and subtitle.get('type') == 'CLOSED':
subtitles.setdefault('nl', []).append({'url': subtitle_url})
return {
'id': video_id,
'display_id': video_id,
'title': title,
'description': description,
'formats': formats,
'duration': float_or_none(data.get('duration'), 1000),
'thumbnail': data.get('posterImageUrl'),
'subtitles': subtitles,
}
class CanvasEenIE(InfoExtractor):
IE_DESC = 'canvas.be and een.be'
_VALID_URL = r'https?://(?:www\.)?(?P<site_id>canvas|een)\.be/(?:[^/]+/)*(?P<id>[^/?#&]+)'
_TESTS = [{
'url': 'http://www.canvas.be/video/de-afspraak/najaar-2015/de-afspraak-veilt-voor-de-warmste-week',
'md5': 'ed66976748d12350b118455979cca293',
'info_dict': {
'id': 'mz-ast-5e5f90b6-2d72-4c40-82c2-e134f884e93e',
'display_id': 'de-afspraak-veilt-voor-de-warmste-week',
'ext': 'flv',
'title': 'De afspraak veilt voor de Warmste Week',
'description': 'md5:24cb860c320dc2be7358e0e5aa317ba6',
'thumbnail': r're:^https?://.*\.jpg$',
'duration': 49.02,
},
'expected_warnings': ['is not a supported codec'],
}, {
# with subtitles
'url': 'http://www.canvas.be/video/panorama/2016/pieter-0167',
'info_dict': {
'id': 'mz-ast-5240ff21-2d30-4101-bba6-92b5ec67c625',
'display_id': 'pieter-0167',
'ext': 'mp4',
'title': 'Pieter 0167',
'description': 'md5:943cd30f48a5d29ba02c3a104dc4ec4e',
'thumbnail': r're:^https?://.*\.jpg$',
'duration': 2553.08,
'subtitles': {
'nl': [{
'ext': 'vtt',
}],
},
},
'params': {
'skip_download': True,
},
'skip': 'Pagina niet gevonden',
}, {
'url': 'https://www.een.be/thuis/emma-pakt-thilly-aan',
'info_dict': {
'id': 'md-ast-3a24ced2-64d7-44fb-b4ed-ed1aafbf90b8',
'display_id': 'emma-pakt-thilly-aan',
'ext': 'mp4',
'title': 'Emma pakt Thilly aan',
'description': 'md5:c5c9b572388a99b2690030afa3f3bad7',
'thumbnail': r're:^https?://.*\.jpg$',
'duration': 118.24,
},
'params': {
'skip_download': True,
},
'expected_warnings': ['is not a supported codec'],
}, {
'url': 'https://www.canvas.be/check-point/najaar-2016/de-politie-uw-vriend',
'only_matching': True,
}]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
site_id, display_id = mobj.group('site_id'), mobj.group('id')
webpage = self._download_webpage(url, display_id)
title = strip_or_none(self._search_regex(
r'<h1[^>]+class="video__body__header__title"[^>]*>(.+?)</h1>',
webpage, 'title', default=None) or self._og_search_title(
webpage, default=None))
video_id = self._html_search_regex(
r'data-video=(["\'])(?P<id>(?:(?!\1).)+)\1', webpage, 'video id',
group='id')
return {
'_type': 'url_transparent',
'url': 'https://mediazone.vrt.be/api/v1/%s/assets/%s' % (site_id, video_id),
'ie_key': CanvasIE.ie_key(),
'id': video_id,
'display_id': display_id,
'title': title,
'description': self._og_search_description(webpage),
}
class VrtNUIE(GigyaBaseIE):
IE_DESC = 'VrtNU.be'
_VALID_URL = r'https?://(?:www\.)?vrt\.be/(?P<site_id>vrtnu)/(?:[^/]+/)*(?P<id>[^/?#&]+)'
_TESTS = [{
# Available via old API endpoint
'url': 'https://www.vrt.be/vrtnu/a-z/postbus-x/1/postbus-x-s1a1/',
'info_dict': {
'id': 'pbs-pub-2e2d8c27-df26-45c9-9dc6-90c78153044d$vid-90c932b1-e21d-4fb8-99b1-db7b49cf74de',
'ext': 'mp4',
'title': 'De zwarte weduwe',
'description': 'md5:db1227b0f318c849ba5eab1fef895ee4',
'duration': 1457.04,
'thumbnail': r're:^https?://.*\.jpg$',
'season': 'Season 1',
'season_number': 1,
'episode_number': 1,
},
'skip': 'This video is only available for registered users',
'params': {
'username': '<snip>',
'password': '<snip>',
},
'expected_warnings': ['is not a supported codec'],
}, {
# Only available via new API endpoint
'url': 'https://www.vrt.be/vrtnu/a-z/kamp-waes/1/kamp-waes-s1a5/',
'info_dict': {
'id': 'pbs-pub-0763b56c-64fb-4d38-b95b-af60bf433c71$vid-ad36a73c-4735-4f1f-b2c0-a38e6e6aa7e1',
'ext': 'mp4',
'title': 'Aflevering 5',
'description': 'Wie valt door de mand tijdens een missie?',
'duration': 2967.06,
'season': 'Season 1',
'season_number': 1,
'episode_number': 5,
},
'skip': 'This video is only available for registered users',
'params': {
'username': '<snip>',
'password': '<snip>',
},
'expected_warnings': ['Unable to download asset JSON', 'is not a supported codec', 'Unknown MIME type'],
}]
_NETRC_MACHINE = 'vrtnu'
_APIKEY = '3_0Z2HujMtiWq_pkAjgnS2Md2E11a1AwZjYiBETtwNE-EoEHDINgtnvcAOpNgmrVGy'
_CONTEXT_ID = 'R3595707040'
def _real_initialize(self):
self._login()
def _login(self):
username, password = self._get_login_info()
if username is None:
return
auth_data = {
'APIKey': self._APIKEY,
'targetEnv': 'jssdk',
'loginID': username,
'password': password,
'authMode': 'cookie',
}
auth_info = self._gigya_login(auth_data)
# Sometimes authentication fails for no good reason, retry
login_attempt = 1
while login_attempt <= 3:
try:
# When requesting a token, no actual token is returned, but the
# necessary cookies are set.
self._request_webpage(
'https://token.vrt.be',
None, note='Requesting a token', errnote='Could not get a token',
headers={
'Content-Type': 'application/json',
'Referer': 'https://www.vrt.be/vrtnu/',
},
data=json.dumps({
'uid': auth_info['UID'],
'uidsig': auth_info['UIDSignature'],
'ts': auth_info['signatureTimestamp'],
'email': auth_info['profile']['email'],
}).encode('utf-8'))
except ExtractorError as e:
if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
login_attempt += 1
self.report_warning('Authentication failed')
self._sleep(1, None, msg_template='Waiting for %(timeout)s seconds before trying again')
else:
raise e
else:
break
def _real_extract(self, url):
display_id = self._match_id(url)
webpage, urlh = self._download_webpage_handle(url, display_id)
info = self._search_json_ld(webpage, display_id, default={})
# title is optional here since it may be extracted by extractor
# that is delegated from here
title = strip_or_none(self._html_search_regex(
r'(?ms)<h1 class="content__heading">(.+?)</h1>',
webpage, 'title', default=None))
description = self._html_search_regex(
r'(?ms)<div class="content__description">(.+?)</div>',
webpage, 'description', default=None)
season = self._html_search_regex(
[r'''(?xms)<div\ class="tabs__tab\ tabs__tab--active">\s*
<span>seizoen\ (.+?)</span>\s*
</div>''',
r'<option value="seizoen (\d{1,3})" data-href="[^"]+?" selected>'],
webpage, 'season', default=None)
season_number = int_or_none(season)
episode_number = int_or_none(self._html_search_regex(
r'''(?xms)<div\ class="content__episode">\s*
<abbr\ title="aflevering">afl</abbr>\s*<span>(\d+)</span>
</div>''',
webpage, 'episode_number', default=None))
release_date = parse_iso8601(self._html_search_regex(
r'(?ms)<div class="content__broadcastdate">\s*<time\ datetime="(.+?)"',
webpage, 'release_date', default=None))
# If there's a ? or a # in the URL, remove them and everything after
clean_url = urlh.geturl().split('?')[0].split('#')[0].strip('/')
securevideo_url = clean_url + '.mssecurevideo.json'
try:
video = self._download_json(securevideo_url, display_id)
except ExtractorError as e:
if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
self.raise_login_required()
raise
# We are dealing with a '../<show>.relevant' URL
redirect_url = video.get('url')
if redirect_url:
return self.url_result(self._proto_relative_url(redirect_url, 'https:'))
# There is only one entry, but with an unknown key, so just get
# the first one
video_id = list(video.values())[0].get('videoid')
return merge_dicts(info, {
'_type': 'url_transparent',
'url': 'https://mediazone.vrt.be/api/v1/vrtvideo/assets/%s' % video_id,
'ie_key': CanvasIE.ie_key(),
'id': video_id,
'display_id': display_id,
'title': title,
'description': description,
'season': season,
'season_number': season_number,
'episode_number': episode_number,
'release_date': release_date,
})

View File

@@ -0,0 +1,108 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
float_or_none,
int_or_none,
try_get,
)
from .videomore import VideomoreIE
class CarambaTVIE(InfoExtractor):
_VALID_URL = r'(?:carambatv:|https?://video1\.carambatv\.ru/v/)(?P<id>\d+)'
_TESTS = [{
'url': 'http://video1.carambatv.ru/v/191910501',
'md5': '2f4a81b7cfd5ab866ee2d7270cb34a2a',
'info_dict': {
'id': '191910501',
'ext': 'mp4',
'title': '[BadComedian] - Разборка в Маниле (Абсолютный обзор)',
'thumbnail': r're:^https?://.*\.jpg',
'duration': 2678.31,
},
}, {
'url': 'carambatv:191910501',
'only_matching': True,
}]
def _real_extract(self, url):
video_id = self._match_id(url)
video = self._download_json(
'http://video1.carambatv.ru/v/%s/videoinfo.js' % video_id,
video_id)
title = video['title']
base_url = video.get('video') or 'http://video1.carambatv.ru/v/%s/' % video_id
formats = [{
'url': base_url + f['fn'],
'height': int_or_none(f.get('height')),
'format_id': '%sp' % f['height'] if f.get('height') else None,
} for f in video['qualities'] if f.get('fn')]
self._sort_formats(formats)
thumbnail = video.get('splash')
duration = float_or_none(try_get(
video, lambda x: x['annotations'][0]['end_time'], compat_str))
return {
'id': video_id,
'title': title,
'thumbnail': thumbnail,
'duration': duration,
'formats': formats,
}
class CarambaTVPageIE(InfoExtractor):
_VALID_URL = r'https?://carambatv\.ru/(?:[^/]+/)+(?P<id>[^/?#&]+)'
_TEST = {
'url': 'http://carambatv.ru/movie/bad-comedian/razborka-v-manile/',
'md5': 'a49fb0ec2ad66503eeb46aac237d3c86',
'info_dict': {
'id': '475222',
'ext': 'flv',
'title': '[BadComedian] - Разборка в Маниле (Абсолютный обзор)',
'thumbnail': r're:^https?://.*\.jpg',
# duration reported by videomore is incorrect
'duration': int,
},
'add_ie': [VideomoreIE.ie_key()],
}
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
videomore_url = VideomoreIE._extract_url(webpage)
if not videomore_url:
videomore_id = self._search_regex(
r'getVMCode\s*\(\s*["\']?(\d+)', webpage, 'videomore id',
default=None)
if videomore_id:
videomore_url = 'videomore:%s' % videomore_id
if videomore_url:
title = self._og_search_title(webpage)
return {
'_type': 'url_transparent',
'url': videomore_url,
'ie_key': VideomoreIE.ie_key(),
'title': title,
}
video_url = self._og_search_property('video:iframe', webpage, default=None)
if not video_url:
video_id = self._search_regex(
r'(?:video_id|crmb_vuid)\s*[:=]\s*["\']?(\d+)',
webpage, 'video id')
video_url = 'carambatv:%s' % video_id
return self.url_result(video_url, CarambaTVIE.ie_key())

View File

@@ -0,0 +1,62 @@
# coding: utf-8
from __future__ import unicode_literals
from .turner import TurnerBaseIE
from ..utils import int_or_none
class CartoonNetworkIE(TurnerBaseIE):
_VALID_URL = r'https?://(?:www\.)?cartoonnetwork\.com/video/(?:[^/]+/)+(?P<id>[^/?#]+)-(?:clip|episode)\.html'
_TEST = {
'url': 'https://www.cartoonnetwork.com/video/ben-10/how-to-draw-upgrade-episode.html',
'info_dict': {
'id': '6e3375097f63874ebccec7ef677c1c3845fa850e',
'ext': 'mp4',
'title': 'How to Draw Upgrade',
'description': 'md5:2061d83776db7e8be4879684eefe8c0f',
},
'params': {
# m3u8 download
'skip_download': True,
},
}
def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
def find_field(global_re, name, content_re=None, value_re='[^"]+', fatal=False):
metadata_re = ''
if content_re:
metadata_re = r'|video_metadata\.content_' + content_re
return self._search_regex(
r'(?:_cnglobal\.currentVideo\.%s%s)\s*=\s*"(%s)";' % (global_re, metadata_re, value_re),
webpage, name, fatal=fatal)
media_id = find_field('mediaId', 'media id', 'id', '[0-9a-f]{40}', True)
title = find_field('episodeTitle', 'title', '(?:episodeName|name)', fatal=True)
info = self._extract_ngtv_info(
media_id, {'networkId': 'cartoonnetwork'}, {
'url': url,
'site_name': 'CartoonNetwork',
'auth_required': find_field('authType', 'auth type') != 'unauth',
})
series = find_field(
'propertyName', 'series', 'showName') or self._html_search_meta('partOfSeries', webpage)
info.update({
'id': media_id,
'display_id': display_id,
'title': title,
'description': self._html_search_meta('description', webpage),
'series': series,
'episode': title,
})
for field in ('season', 'episode'):
field_name = field + 'Number'
info[field + '_number'] = int_or_none(find_field(
field_name, field + ' number', value_re=r'\d+') or self._html_search_meta(field_name, webpage))
return info

View File

@@ -0,0 +1,497 @@
# coding: utf-8
from __future__ import unicode_literals
import hashlib
import json
import re
from xml.sax.saxutils import escape
from .common import InfoExtractor
from ..compat import (
compat_str,
compat_HTTPError,
)
from ..utils import (
js_to_json,
smuggle_url,
try_get,
xpath_text,
xpath_element,
xpath_with_ns,
find_xpath_attr,
orderedSet,
parse_duration,
parse_iso8601,
parse_age_limit,
strip_or_none,
int_or_none,
ExtractorError,
)
class CBCIE(InfoExtractor):
IE_NAME = 'cbc.ca'
_VALID_URL = r'https?://(?:www\.)?cbc\.ca/(?!player/)(?:[^/]+/)+(?P<id>[^/?#]+)'
_TESTS = [{
# with mediaId
'url': 'http://www.cbc.ca/22minutes/videos/clips-season-23/don-cherry-play-offs',
'md5': '97e24d09672fc4cf56256d6faa6c25bc',
'info_dict': {
'id': '2682904050',
'ext': 'mp4',
'title': 'Don Cherry All-Stars',
'description': 'Don Cherry has a bee in his bonnet about AHL player John Scott because that guys got heart.',
'timestamp': 1454463000,
'upload_date': '20160203',
'uploader': 'CBCC-NEW',
},
'skip': 'Geo-restricted to Canada',
}, {
# with clipId, feed available via tpfeed.cbc.ca and feed.theplatform.com
'url': 'http://www.cbc.ca/22minutes/videos/22-minutes-update/22-minutes-update-episode-4',
'md5': '162adfa070274b144f4fdc3c3b8207db',
'info_dict': {
'id': '2414435309',
'ext': 'mp4',
'title': '22 Minutes Update: What Not To Wear Quebec',
'description': "This week's latest Canadian top political story is What Not To Wear Quebec.",
'upload_date': '20131025',
'uploader': 'CBCC-NEW',
'timestamp': 1382717907,
},
}, {
# with clipId, feed only available via tpfeed.cbc.ca
'url': 'http://www.cbc.ca/archives/entry/1978-robin-williams-freestyles-on-90-minutes-live',
'md5': '0274a90b51a9b4971fe005c63f592f12',
'info_dict': {
'id': '2487345465',
'ext': 'mp4',
'title': 'Robin Williams freestyles on 90 Minutes Live',
'description': 'Wacky American comedian Robin Williams shows off his infamous "freestyle" comedic talents while being interviewed on CBC\'s 90 Minutes Live.',
'upload_date': '19780210',
'uploader': 'CBCC-NEW',
'timestamp': 255977160,
},
}, {
# multiple iframes
'url': 'http://www.cbc.ca/natureofthings/blog/birds-eye-view-from-vancouvers-burrard-street-bridge-how-we-got-the-shot',
'playlist': [{
'md5': '377572d0b49c4ce0c9ad77470e0b96b4',
'info_dict': {
'id': '2680832926',
'ext': 'mp4',
'title': 'An Eagle\'s-Eye View Off Burrard Bridge',
'description': 'Hercules the eagle flies from Vancouver\'s Burrard Bridge down to a nearby park with a mini-camera strapped to his back.',
'upload_date': '20160201',
'timestamp': 1454342820,
'uploader': 'CBCC-NEW',
},
}, {
'md5': '415a0e3f586113894174dfb31aa5bb1a',
'info_dict': {
'id': '2658915080',
'ext': 'mp4',
'title': 'Fly like an eagle!',
'description': 'Eagle equipped with a mini camera flies from the world\'s tallest tower',
'upload_date': '20150315',
'timestamp': 1426443984,
'uploader': 'CBCC-NEW',
},
}],
'skip': 'Geo-restricted to Canada',
}, {
# multiple CBC.APP.Caffeine.initInstance(...)
'url': 'http://www.cbc.ca/news/canada/calgary/dog-indoor-exercise-winter-1.3928238',
'info_dict': {
'title': 'Keep Rover active during the deep freeze with doggie pushups and other fun indoor tasks',
'id': 'dog-indoor-exercise-winter-1.3928238',
'description': 'md5:c18552e41726ee95bd75210d1ca9194c',
},
'playlist_mincount': 6,
}]
@classmethod
def suitable(cls, url):
return False if CBCPlayerIE.suitable(url) else super(CBCIE, cls).suitable(url)
def _extract_player_init(self, player_init, display_id):
player_info = self._parse_json(player_init, display_id, js_to_json)
media_id = player_info.get('mediaId')
if not media_id:
clip_id = player_info['clipId']
feed = self._download_json(
'http://tpfeed.cbc.ca/f/ExhSPC/vms_5akSXx4Ng_Zn?byCustomValue={:mpsReleases}{%s}' % clip_id,
clip_id, fatal=False)
if feed:
media_id = try_get(feed, lambda x: x['entries'][0]['guid'], compat_str)
if not media_id:
media_id = self._download_json(
'http://feed.theplatform.com/f/h9dtGB/punlNGjMlc1F?fields=id&byContent=byReleases%3DbyId%253D' + clip_id,
clip_id)['entries'][0]['id'].split('/')[-1]
return self.url_result('cbcplayer:%s' % media_id, 'CBCPlayer', media_id)
def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
title = self._og_search_title(webpage, default=None) or self._html_search_meta(
'twitter:title', webpage, 'title', default=None) or self._html_search_regex(
r'<title>([^<]+)</title>', webpage, 'title', fatal=False)
entries = [
self._extract_player_init(player_init, display_id)
for player_init in re.findall(r'CBC\.APP\.Caffeine\.initInstance\(({.+?})\);', webpage)]
media_ids = []
for media_id_re in (
r'<iframe[^>]+src="[^"]+?mediaId=(\d+)"',
r'<div[^>]+\bid=["\']player-(\d+)',
r'guid["\']\s*:\s*["\'](\d+)'):
media_ids.extend(re.findall(media_id_re, webpage))
entries.extend([
self.url_result('cbcplayer:%s' % media_id, 'CBCPlayer', media_id)
for media_id in orderedSet(media_ids)])
return self.playlist_result(
entries, display_id, strip_or_none(title),
self._og_search_description(webpage))
class CBCPlayerIE(InfoExtractor):
IE_NAME = 'cbc.ca:player'
_VALID_URL = r'(?:cbcplayer:|https?://(?:www\.)?cbc\.ca/(?:player/play/|i/caffeine/syndicate/\?mediaId=))(?P<id>\d+)'
_TESTS = [{
'url': 'http://www.cbc.ca/player/play/2683190193',
'md5': '64d25f841ddf4ddb28a235338af32e2c',
'info_dict': {
'id': '2683190193',
'ext': 'mp4',
'title': 'Gerry Runs a Sweat Shop',
'description': 'md5:b457e1c01e8ff408d9d801c1c2cd29b0',
'timestamp': 1455071400,
'upload_date': '20160210',
'uploader': 'CBCC-NEW',
},
'skip': 'Geo-restricted to Canada',
}, {
# Redirected from http://www.cbc.ca/player/AudioMobile/All%20in%20a%20Weekend%20Montreal/ID/2657632011/
'url': 'http://www.cbc.ca/player/play/2657631896',
'md5': 'e5e708c34ae6fca156aafe17c43e8b75',
'info_dict': {
'id': '2657631896',
'ext': 'mp3',
'title': 'CBC Montreal is organizing its first ever community hackathon!',
'description': 'The modern technology we tend to depend on so heavily, is never without it\'s share of hiccups and headaches. Next weekend - CBC Montreal will be getting members of the public for its first Hackathon.',
'timestamp': 1425704400,
'upload_date': '20150307',
'uploader': 'CBCC-NEW',
},
}, {
'url': 'http://www.cbc.ca/player/play/2164402062',
'md5': '33fcd8f6719b9dd60a5e73adcb83b9f6',
'info_dict': {
'id': '2164402062',
'ext': 'mp4',
'title': 'Cancer survivor four times over',
'description': 'Tim Mayer has beaten three different forms of cancer four times in five years.',
'timestamp': 1320410746,
'upload_date': '20111104',
'uploader': 'CBCC-NEW',
},
}]
def _real_extract(self, url):
video_id = self._match_id(url)
return {
'_type': 'url_transparent',
'ie_key': 'ThePlatform',
'url': smuggle_url(
'http://link.theplatform.com/s/ExhSPC/media/guid/2655402169/%s?mbr=true&formats=MPEG4,FLV,MP3' % video_id, {
'force_smil_url': True
}),
'id': video_id,
}
class CBCWatchBaseIE(InfoExtractor):
_device_id = None
_device_token = None
_API_BASE_URL = 'https://api-cbc.cloud.clearleap.com/cloffice/client/'
_NS_MAP = {
'media': 'http://search.yahoo.com/mrss/',
'clearleap': 'http://www.clearleap.com/namespace/clearleap/1.0/',
}
_GEO_COUNTRIES = ['CA']
_LOGIN_URL = 'https://api.loginradius.com/identity/v2/auth/login'
_TOKEN_URL = 'https://cloud-api.loginradius.com/sso/jwt/api/token'
_API_KEY = '3f4beddd-2061-49b0-ae80-6f1f2ed65b37'
_NETRC_MACHINE = 'cbcwatch'
def _signature(self, email, password):
data = json.dumps({
'email': email,
'password': password,
}).encode()
headers = {'content-type': 'application/json'}
query = {'apikey': self._API_KEY}
resp = self._download_json(self._LOGIN_URL, None, data=data, headers=headers, query=query)
access_token = resp['access_token']
# token
query = {
'access_token': access_token,
'apikey': self._API_KEY,
'jwtapp': 'jwt',
}
resp = self._download_json(self._TOKEN_URL, None, headers=headers, query=query)
return resp['signature']
def _call_api(self, path, video_id):
url = path if path.startswith('http') else self._API_BASE_URL + path
for _ in range(2):
try:
result = self._download_xml(url, video_id, headers={
'X-Clearleap-DeviceId': self._device_id,
'X-Clearleap-DeviceToken': self._device_token,
})
except ExtractorError as e:
if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
# Device token has expired, re-acquiring device token
self._register_device()
continue
raise
error_message = xpath_text(result, 'userMessage') or xpath_text(result, 'systemMessage')
if error_message:
raise ExtractorError('%s said: %s' % (self.IE_NAME, error_message))
return result
def _real_initialize(self):
if self._valid_device_token():
return
device = self._downloader.cache.load(
'cbcwatch', self._cache_device_key()) or {}
self._device_id, self._device_token = device.get('id'), device.get('token')
if self._valid_device_token():
return
self._register_device()
def _valid_device_token(self):
return self._device_id and self._device_token
def _cache_device_key(self):
email, _ = self._get_login_info()
return '%s_device' % hashlib.sha256(email.encode()).hexdigest() if email else 'device'
def _register_device(self):
result = self._download_xml(
self._API_BASE_URL + 'device/register',
None, 'Acquiring device token',
data=b'<device><type>web</type></device>')
self._device_id = xpath_text(result, 'deviceId', fatal=True)
email, password = self._get_login_info()
if email and password:
signature = self._signature(email, password)
data = '<login><token>{0}</token><device><deviceId>{1}</deviceId><type>web</type></device></login>'.format(
escape(signature), escape(self._device_id)).encode()
url = self._API_BASE_URL + 'device/login'
result = self._download_xml(
url, None, data=data,
headers={'content-type': 'application/xml'})
self._device_token = xpath_text(result, 'token', fatal=True)
else:
self._device_token = xpath_text(result, 'deviceToken', fatal=True)
self._downloader.cache.store(
'cbcwatch', self._cache_device_key(), {
'id': self._device_id,
'token': self._device_token,
})
def _parse_rss_feed(self, rss):
channel = xpath_element(rss, 'channel', fatal=True)
def _add_ns(path):
return xpath_with_ns(path, self._NS_MAP)
entries = []
for item in channel.findall('item'):
guid = xpath_text(item, 'guid', fatal=True)
title = xpath_text(item, 'title', fatal=True)
media_group = xpath_element(item, _add_ns('media:group'), fatal=True)
content = xpath_element(media_group, _add_ns('media:content'), fatal=True)
content_url = content.attrib['url']
thumbnails = []
for thumbnail in media_group.findall(_add_ns('media:thumbnail')):
thumbnail_url = thumbnail.get('url')
if not thumbnail_url:
continue
thumbnails.append({
'id': thumbnail.get('profile'),
'url': thumbnail_url,
'width': int_or_none(thumbnail.get('width')),
'height': int_or_none(thumbnail.get('height')),
})
timestamp = None
release_date = find_xpath_attr(
item, _add_ns('media:credit'), 'role', 'releaseDate')
if release_date is not None:
timestamp = parse_iso8601(release_date.text)
entries.append({
'_type': 'url_transparent',
'url': content_url,
'id': guid,
'title': title,
'description': xpath_text(item, 'description'),
'timestamp': timestamp,
'duration': int_or_none(content.get('duration')),
'age_limit': parse_age_limit(xpath_text(item, _add_ns('media:rating'))),
'episode': xpath_text(item, _add_ns('clearleap:episode')),
'episode_number': int_or_none(xpath_text(item, _add_ns('clearleap:episodeInSeason'))),
'series': xpath_text(item, _add_ns('clearleap:series')),
'season_number': int_or_none(xpath_text(item, _add_ns('clearleap:season'))),
'thumbnails': thumbnails,
'ie_key': 'CBCWatchVideo',
})
return self.playlist_result(
entries, xpath_text(channel, 'guid'),
xpath_text(channel, 'title'),
xpath_text(channel, 'description'))
class CBCWatchVideoIE(CBCWatchBaseIE):
IE_NAME = 'cbc.ca:watch:video'
_VALID_URL = r'https?://api-cbc\.cloud\.clearleap\.com/cloffice/client/web/play/?\?.*?\bcontentId=(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'
_TEST = {
# geo-restricted to Canada, bypassable
'url': 'https://api-cbc.cloud.clearleap.com/cloffice/client/web/play/?contentId=3c84472a-1eea-4dee-9267-2655d5055dcf&categoryId=ebc258f5-ee40-4cca-b66b-ba6bd55b7235',
'only_matching': True,
}
def _real_extract(self, url):
video_id = self._match_id(url)
result = self._call_api(url, video_id)
m3u8_url = xpath_text(result, 'url', fatal=True)
formats = self._extract_m3u8_formats(re.sub(r'/([^/]+)/[^/?]+\.m3u8', r'/\1/\1.m3u8', m3u8_url), video_id, 'mp4', fatal=False)
if len(formats) < 2:
formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4')
for f in formats:
format_id = f.get('format_id')
if format_id.startswith('AAC'):
f['acodec'] = 'aac'
elif format_id.startswith('AC3'):
f['acodec'] = 'ac-3'
self._sort_formats(formats)
info = {
'id': video_id,
'title': video_id,
'formats': formats,
}
rss = xpath_element(result, 'rss')
if rss:
info.update(self._parse_rss_feed(rss)['entries'][0])
del info['url']
del info['_type']
del info['ie_key']
return info
class CBCWatchIE(CBCWatchBaseIE):
IE_NAME = 'cbc.ca:watch'
_VALID_URL = r'https?://(?:gem|watch)\.cbc\.ca/(?:[^/]+/)+(?P<id>[0-9a-f-]+)'
_TESTS = [{
# geo-restricted to Canada, bypassable
'url': 'http://watch.cbc.ca/doc-zone/season-6/customer-disservice/38e815a-009e3ab12e4',
'info_dict': {
'id': '9673749a-5e77-484c-8b62-a1092a6b5168',
'ext': 'mp4',
'title': 'Customer (Dis)Service',
'description': 'md5:8bdd6913a0fe03d4b2a17ebe169c7c87',
'upload_date': '20160219',
'timestamp': 1455840000,
},
'params': {
# m3u8 download
'skip_download': True,
'format': 'bestvideo',
},
}, {
# geo-restricted to Canada, bypassable
'url': 'http://watch.cbc.ca/arthur/all/1ed4b385-cd84-49cf-95f0-80f004680057',
'info_dict': {
'id': '1ed4b385-cd84-49cf-95f0-80f004680057',
'title': 'Arthur',
'description': 'Arthur, the sweetest 8-year-old aardvark, and his pals solve all kinds of problems with humour, kindness and teamwork.',
},
'playlist_mincount': 30,
}, {
'url': 'https://gem.cbc.ca/media/this-hour-has-22-minutes/season-26/episode-20/38e815a-0108c6c6a42',
'only_matching': True,
}]
def _real_extract(self, url):
video_id = self._match_id(url)
rss = self._call_api('web/browse/' + video_id, video_id)
return self._parse_rss_feed(rss)
class CBCOlympicsIE(InfoExtractor):
IE_NAME = 'cbc.ca:olympics'
_VALID_URL = r'https?://olympics\.cbc\.ca/video/[^/]+/(?P<id>[^/?#]+)'
_TESTS = [{
'url': 'https://olympics.cbc.ca/video/whats-on-tv/olympic-morning-featuring-the-opening-ceremony/',
'only_matching': True,
}]
def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
video_id = self._hidden_inputs(webpage)['videoId']
video_doc = self._download_xml(
'https://olympics.cbc.ca/videodata/%s.xml' % video_id, video_id)
title = xpath_text(video_doc, 'title', fatal=True)
is_live = xpath_text(video_doc, 'kind') == 'Live'
if is_live:
title = self._live_title(title)
formats = []
for video_source in video_doc.findall('videoSources/videoSource'):
uri = xpath_text(video_source, 'uri')
if not uri:
continue
tokenize = self._download_json(
'https://olympics.cbc.ca/api/api-akamai/tokenize',
video_id, data=json.dumps({
'VideoSource': uri,
}).encode(), headers={
'Content-Type': 'application/json',
'Referer': url,
# d3.VideoPlayer._init in https://olympics.cbc.ca/components/script/base.js
'Cookie': '_dvp=TK:C0ObxjerU', # AKAMAI CDN cookie
}, fatal=False)
if not tokenize:
continue
content_url = tokenize['ContentUrl']
video_source_format = video_source.get('format')
if video_source_format == 'IIS':
formats.extend(self._extract_ism_formats(
content_url, video_id, ism_id=video_source_format, fatal=False))
else:
formats.extend(self._extract_m3u8_formats(
content_url, video_id, 'mp4',
'm3u8' if is_live else 'm3u8_native',
m3u8_id=video_source_format, fatal=False))
self._sort_formats(formats)
return {
'id': video_id,
'display_id': display_id,
'title': title,
'description': xpath_text(video_doc, 'description'),
'thumbnail': xpath_text(video_doc, 'thumbnailUrl'),
'duration': parse_duration(xpath_text(video_doc, 'duration')),
'formats': formats,
'is_live': is_live,
}

View File

@@ -0,0 +1,112 @@
from __future__ import unicode_literals
from .theplatform import ThePlatformFeedIE
from ..utils import (
ExtractorError,
int_or_none,
find_xpath_attr,
xpath_element,
xpath_text,
update_url_query,
)
class CBSBaseIE(ThePlatformFeedIE):
def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
subtitles = {}
for k, ext in [('sMPTE-TTCCURL', 'tt'), ('ClosedCaptionURL', 'ttml'), ('webVTTCaptionURL', 'vtt')]:
cc_e = find_xpath_attr(smil, self._xpath_ns('.//param', namespace), 'name', k)
if cc_e is not None:
cc_url = cc_e.get('value')
if cc_url:
subtitles.setdefault(subtitles_lang, []).append({
'ext': ext,
'url': cc_url,
})
return subtitles
class CBSIE(CBSBaseIE):
_VALID_URL = r'(?:cbs:|https?://(?:www\.)?(?:cbs\.com/shows/[^/]+/video|colbertlateshow\.com/(?:video|podcasts))/)(?P<id>[\w-]+)'
_TESTS = [{
'url': 'http://www.cbs.com/shows/garth-brooks/video/_u7W953k6la293J7EPTd9oHkSPs6Xn6_/connect-chat-feat-garth-brooks/',
'info_dict': {
'id': '_u7W953k6la293J7EPTd9oHkSPs6Xn6_',
'ext': 'mp4',
'title': 'Connect Chat feat. Garth Brooks',
'description': 'Connect with country music singer Garth Brooks, as he chats with fans on Wednesday November 27, 2013. Be sure to tune in to Garth Brooks: Live from Las Vegas, Friday November 29, at 9/8c on CBS!',
'duration': 1495,
'timestamp': 1385585425,
'upload_date': '20131127',
'uploader': 'CBSI-NEW',
},
'params': {
# m3u8 download
'skip_download': True,
},
'_skip': 'Blocked outside the US',
}, {
'url': 'http://colbertlateshow.com/video/8GmB0oY0McANFvp2aEffk9jZZZ2YyXxy/the-colbeard/',
'only_matching': True,
}, {
'url': 'http://www.colbertlateshow.com/podcasts/dYSwjqPs_X1tvbV_P2FcPWRa_qT6akTC/in-the-bad-room-with-stephen/',
'only_matching': True,
}]
def _extract_video_info(self, content_id, site='cbs', mpx_acc=2198311517):
items_data = self._download_xml(
'http://can.cbs.com/thunder/player/videoPlayerService.php',
content_id, query={'partner': site, 'contentId': content_id})
video_data = xpath_element(items_data, './/item')
title = xpath_text(video_data, 'videoTitle', 'title', True)
tp_path = 'dJ5BDC/media/guid/%d/%s' % (mpx_acc, content_id)
tp_release_url = 'http://link.theplatform.com/s/' + tp_path
asset_types = []
subtitles = {}
formats = []
last_e = None
for item in items_data.findall('.//item'):
asset_type = xpath_text(item, 'assetType')
if not asset_type or asset_type in asset_types or 'HLS_FPS' in asset_type or 'DASH_CENC' in asset_type:
continue
asset_types.append(asset_type)
query = {
'mbr': 'true',
'assetTypes': asset_type,
}
if asset_type.startswith('HLS') or asset_type in ('OnceURL', 'StreamPack'):
query['formats'] = 'MPEG4,M3U'
elif asset_type in ('RTMP', 'WIFI', '3G'):
query['formats'] = 'MPEG4,FLV'
try:
tp_formats, tp_subtitles = self._extract_theplatform_smil(
update_url_query(tp_release_url, query), content_id,
'Downloading %s SMIL data' % asset_type)
except ExtractorError as e:
last_e = e
continue
formats.extend(tp_formats)
subtitles = self._merge_subtitles(subtitles, tp_subtitles)
if last_e and not formats:
raise last_e
self._sort_formats(formats)
info = self._extract_theplatform_metadata(tp_path, content_id)
info.update({
'id': content_id,
'title': title,
'series': xpath_text(video_data, 'seriesTitle'),
'season_number': int_or_none(xpath_text(video_data, 'seasonNumber')),
'episode_number': int_or_none(xpath_text(video_data, 'episodeNumber')),
'duration': int_or_none(xpath_text(video_data, 'videoLength'), 1000),
'thumbnail': xpath_text(video_data, 'previewImageURL'),
'formats': formats,
'subtitles': subtitles,
})
return info
def _real_extract(self, url):
content_id = self._match_id(url)
return self._extract_video_info(content_id)

View File

@@ -0,0 +1,103 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .cbs import CBSIE
from ..utils import int_or_none
class CBSInteractiveIE(CBSIE):
_VALID_URL = r'https?://(?:www\.)?(?P<site>cnet|zdnet)\.com/(?:videos|video(?:/share)?)/(?P<id>[^/?]+)'
_TESTS = [{
'url': 'http://www.cnet.com/videos/hands-on-with-microsofts-windows-8-1-update/',
'info_dict': {
'id': 'R49SYt__yAfmlXR85z4f7gNmCBDcN_00',
'display_id': 'hands-on-with-microsofts-windows-8-1-update',
'ext': 'mp4',
'title': 'Hands-on with Microsoft Windows 8.1 Update',
'description': 'The new update to the Windows 8 OS brings improved performance for mouse and keyboard users.',
'uploader_id': '6085384d-619e-11e3-b231-14feb5ca9861',
'uploader': 'Sarah Mitroff',
'duration': 70,
'timestamp': 1396479627,
'upload_date': '20140402',
},
'params': {
# m3u8 download
'skip_download': True,
},
}, {
'url': 'http://www.cnet.com/videos/whiny-pothole-tweets-at-local-government-when-hit-by-cars-tomorrow-daily-187/',
'md5': 'f11d27b2fa18597fbf92444d2a9ed386',
'info_dict': {
'id': 'kjOJd_OoVJqbg_ZD8MZCOk8Wekb9QccK',
'display_id': 'whiny-pothole-tweets-at-local-government-when-hit-by-cars-tomorrow-daily-187',
'ext': 'mp4',
'title': 'Whiny potholes tweet at local government when hit by cars (Tomorrow Daily 187)',
'description': 'md5:d2b9a95a5ffe978ae6fbd4cf944d618f',
'uploader_id': 'b163284d-6b73-44fc-b3e6-3da66c392d40',
'uploader': 'Ashley Esqueda',
'duration': 1482,
'timestamp': 1433289889,
'upload_date': '20150603',
},
}, {
'url': 'http://www.zdnet.com/video/share/video-keeping-android-smartphones-and-tablets-secure/',
'info_dict': {
'id': 'k0r4T_ehht4xW_hAOqiVQPuBDPZ8SRjt',
'display_id': 'video-keeping-android-smartphones-and-tablets-secure',
'ext': 'mp4',
'title': 'Video: Keeping Android smartphones and tablets secure',
'description': 'Here\'s the best way to keep Android devices secure, and what you do when they\'ve come to the end of their lives.',
'uploader_id': 'f2d97ea2-8175-11e2-9d12-0018fe8a00b0',
'uploader': 'Adrian Kingsley-Hughes',
'duration': 731,
'timestamp': 1449129925,
'upload_date': '20151203',
},
'params': {
# m3u8 download
'skip_download': True,
},
}, {
'url': 'http://www.zdnet.com/video/huawei-matebook-x-video/',
'only_matching': True,
}]
MPX_ACCOUNTS = {
'cnet': 2198311517,
'zdnet': 2387448114,
}
def _real_extract(self, url):
site, display_id = re.match(self._VALID_URL, url).groups()
webpage = self._download_webpage(url, display_id)
data_json = self._html_search_regex(
r"data(?:-(?:cnet|zdnet))?-video(?:-(?:uvp(?:js)?|player))?-options='([^']+)'",
webpage, 'data json')
data = self._parse_json(data_json, display_id)
vdata = data.get('video') or (data.get('videos') or data.get('playlist'))[0]
video_id = vdata['mpxRefId']
title = vdata['title']
author = vdata.get('author')
if author:
uploader = '%s %s' % (author['firstName'], author['lastName'])
uploader_id = author.get('id')
else:
uploader = None
uploader_id = None
info = self._extract_video_info(video_id, site, self.MPX_ACCOUNTS[site])
info.update({
'id': video_id,
'display_id': display_id,
'title': title,
'duration': int_or_none(vdata.get('duration')),
'uploader': uploader,
'uploader_id': uploader_id,
})
return info

View File

@@ -0,0 +1,104 @@
# coding: utf-8
from __future__ import unicode_literals
from .anvato import AnvatoIE
from .sendtonews import SendtoNewsIE
from ..compat import compat_urlparse
from ..utils import (
parse_iso8601,
unified_timestamp,
)
class CBSLocalIE(AnvatoIE):
_VALID_URL = r'https?://[a-z]+\.cbslocal\.com/(?:\d+/\d+/\d+|video)/(?P<id>[0-9a-z-]+)'
_TESTS = [{
# Anvato backend
'url': 'http://losangeles.cbslocal.com/2016/05/16/safety-advocates-say-fatal-car-seat-failures-are-public-health-crisis',
'md5': 'f0ee3081e3843f575fccef901199b212',
'info_dict': {
'id': '3401037',
'ext': 'mp4',
'title': 'Safety Advocates Say Fatal Car Seat Failures Are \'Public Health Crisis\'',
'description': 'Collapsing seats have been the focus of scrutiny for decades, though experts say remarkably little has been done to address the issue. Randy Paige reports.',
'thumbnail': 're:^https?://.*',
'timestamp': 1463440500,
'upload_date': '20160516',
'uploader': 'CBS',
'subtitles': {
'en': 'mincount:5',
},
'categories': [
'Stations\\Spoken Word\\KCBSTV',
'Syndication\\MSN',
'Syndication\\NDN',
'Syndication\\AOL',
'Syndication\\Yahoo',
'Syndication\\Tribune',
'Syndication\\Curb.tv',
'Content\\News'
],
'tags': ['CBS 2 News Evening'],
},
}, {
# SendtoNews embed
'url': 'http://cleveland.cbslocal.com/2016/05/16/indians-score-season-high-15-runs-in-blowout-win-over-reds-rapid-reaction/',
'info_dict': {
'id': 'GxfCe0Zo7D-175909-5588',
},
'playlist_count': 9,
'params': {
# m3u8 download
'skip_download': True,
},
}, {
'url': 'http://newyork.cbslocal.com/video/3580809-a-very-blue-anniversary/',
'info_dict': {
'id': '3580809',
'ext': 'mp4',
'title': 'A Very Blue Anniversary',
'description': 'CBS2s Cindy Hsu has more.',
'thumbnail': 're:^https?://.*',
'timestamp': int,
'upload_date': r're:^\d{8}$',
'uploader': 'CBS',
'subtitles': {
'en': 'mincount:5',
},
'categories': [
'Stations\\Spoken Word\\WCBSTV',
'Syndication\\AOL',
'Syndication\\MSN',
'Syndication\\NDN',
'Syndication\\Yahoo',
'Content\\News',
'Content\\News\\Local News',
],
'tags': ['CBS 2 News Weekends', 'Cindy Hsu', 'Blue Man Group'],
},
}]
def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
sendtonews_url = SendtoNewsIE._extract_url(webpage)
if sendtonews_url:
return self.url_result(
compat_urlparse.urljoin(url, sendtonews_url),
ie=SendtoNewsIE.ie_key())
info_dict = self._extract_anvato_videos(webpage, display_id)
timestamp = unified_timestamp(self._html_search_regex(
r'class="(?:entry|post)-date"[^>]*>([^<]+)', webpage,
'released date', default=None)) or parse_iso8601(
self._html_search_meta('uploadDate', webpage))
info_dict.update({
'display_id': display_id,
'timestamp': timestamp,
})
return info_dict

View File

@@ -0,0 +1,147 @@
# coding: utf-8
from __future__ import unicode_literals
import re
import zlib
from .common import InfoExtractor
from .cbs import CBSIE
from ..compat import (
compat_b64decode,
compat_urllib_parse_unquote,
)
from ..utils import (
parse_duration,
)
class CBSNewsEmbedIE(CBSIE):
IE_NAME = 'cbsnews:embed'
_VALID_URL = r'https?://(?:www\.)?cbsnews\.com/embed/video[^#]*#(?P<id>.+)'
_TESTS = [{
'url': 'https://www.cbsnews.com/embed/video/?v=1.c9b5b61492913d6660db0b2f03579ef25e86307a#1Vb7b9s2EP5XBAHbT6Gt98PAMKTJ0se6LVjWYWtdGBR1stlIpEBSTtwi%2F%2FvuJNkNhmHdGxgM2NL57vjd6zt%2B8PngdN%2Fyg79qeGvhzN%2FLGrS%2F%2BuBLB531V28%2B%2BO7Qg7%2Fy97r2z3xZ42NW8yLhDbA0S0KWlHnIijwKWJBHZZnHBa8Cgbpdf%2F89NM9Hi9fXifhpr8sr%2FlP848tn%2BTdXycX25zh4cdX%2FvHl6PmmPqnWQv9w8Ed%2B9GjYRim07bFEqdG%2BZVHuwTm65A7bVRrYtR5lAyMox7pigF6W4k%2By91mjspGsJ%2BwVae4%2BsvdnaO1p73HkXs%2FVisUDTGm7R8IcdnOROeq%2B19qT1amhA1VJtPenoTUgrtfKc9m7Rq8dP7nnjwOB7wg7ADdNt7VX64DWAWlKhPtmDEq22g4GF99x6Dk9E8OSsankHXqPNKDxC%2FdK7MLKTircTDgsI3mmj4OBdSq64dy7fd1x577RU1rt4cvMtOaulFYOd%2FLewRWvDO9lIgXFpZSnkZmjbv5SxKTPoQXClFbpsf%2Fhbbpzs0IB3vb8KkyzJQ%2BywOAgCrMpgRrz%2BKk4fvb7kFbR4XJCu0gAdtNO7woCwZTu%2BBUs9bam%2Fds71drVerpeisgrubLjAB4nnOSkWQnfr5W6o1ku5Xpr1MgrCbL0M0vUyDtfLLK15WiYp47xKWSLyjFVpwVmVJSLIoCjSOFkv3W7oKsVliwZJcB9nwXpZ5GEQQwY8jNKqKCBrgjTLeFxgdCIpazojDgnRtn43J6kG7nZ6cAbxh0EeFFk4%2B1u867cY5u4344n%2FxXjCqAjucdTHgLKojNKmSfO8KRsOFY%2FzKEYCKEJBzv90QA9nfm9gL%2BHulaFqUkz9ULUYxl62B3U%2FRVNLA8IhggaPycOoBuwOCESciDQVSSUgiOMsROB%2FhKfwCKOzEk%2B4k6rWd4uuT%2FwTDz7K7t3d3WLO8ISD95jSPQbayBacthbz86XVgxHwhex5zawzgDOmtp%2F3GPcXn0VXHdSS029%2Fj99UC%2FwJUvyKQ%2FzKyixIEVlYJOn4RxxuaH43Ty9fbJ5OObykHH435XAzJTHeOF4hhEUXD8URe%2FQ%2FBT%2BMpf8d5GN02Ox%2FfiGsl7TA7POu1xZ5%2BbTzcAVKMe48mqcC21hkacVEVScM26liVVBnrKkC4CLKyzAvHu0lhEaTKMFwI3a4SN9MsrfYzdBLq2vkwRD1gVviLT8kY9h2CHH6Y%2Bix6609weFtey4ESp60WtyeWMy%2BsmBuhsoKIyuoT%2Bq2R%2FrW5qi3g%2FvzS2j40DoixDP8%2BKP0yUdpXJ4l6Vla%2Bg9vce%2BC4yM5YlUcbA%2F0jLKdpmTwvsdN5z88nAIe08%2F0HgxeG1iv%2B6Hlhjh7uiW0SDzYNI92L401uha3JKYk268UVRzdOzNQvAaJqoXzAc80dAV440NZ1WVVAAMRYQ2KrGJFmDUsq8saWSnjvIj8t78y%2FRa3JRnbHVfyFpfwoDiGpPgjzekyUiKNlU3OMlwuLMmzgvEojllYVE2Z1HhImvsnk%2BuhusTEoB21PAtSFodeFK3iYhXEH9WOG2%2FkOE833sfeG%2Ff5cfHtEFNXgYes0%2FXj7aGivUgJ9XpusCtoNcNYVVnJVrrDo0OmJAutHCpuZul4W9lLcfy7BnuLPT02%2ByXsCTk%2B9zhzswIN04YueNSK%2BPtM0jS88QdLqSLJDTLsuGZJNolm2yO0PXh3UPnz9Ix5bfIAqxPjvETQsDCEiPG4QbqNyhBZISxybLnZYCrW5H3Axp690%2F0BJdXtDZ5ITuM4xj3f4oUHGzc5JeJmZKpp%2FjwKh4wMV%2FV1yx3emLoR0MwbG4K%2F%2BZgVep3PnzXGDHZ6a3i%2Fk%2BJrONDN13%2Bnq6tBTYk4o7cLGhBtqCC4KwacGHpEVuoH5JNro%2FE6JfE6d5RydbiR76k%2BW5wioDHBIjw1euhHjUGRB0y5A97KoaPx6MlL%2BwgboUVtUFRI%2FLemgTpdtF59ii7pab08kuPcfWzs0l%2FRI5takWnFpka0zOgWRtYcuf9aIxZMxlwr6IiGpsb6j2DQUXPl%2FimXI599Ev7fWjoPD78A',
'only_matching': True,
}]
def _real_extract(self, url):
item = self._parse_json(zlib.decompress(compat_b64decode(
compat_urllib_parse_unquote(self._match_id(url))),
-zlib.MAX_WBITS), None)['video']['items'][0]
return self._extract_video_info(item['mpxRefId'], 'cbsnews')
class CBSNewsIE(CBSIE):
IE_NAME = 'cbsnews'
IE_DESC = 'CBS News'
_VALID_URL = r'https?://(?:www\.)?cbsnews\.com/(?:news|video)/(?P<id>[\da-z_-]+)'
_TESTS = [
{
# 60 minutes
'url': 'http://www.cbsnews.com/news/artificial-intelligence-positioned-to-be-a-game-changer/',
'info_dict': {
'id': 'Y_nf_aEg6WwO9OLAq0MpKaPgfnBUxfW4',
'ext': 'flv',
'title': 'Artificial Intelligence, real-life applications',
'description': 'md5:a7aaf27f1b4777244de8b0b442289304',
'thumbnail': r're:^https?://.*\.jpg$',
'duration': 317,
'uploader': 'CBSI-NEW',
'timestamp': 1476046464,
'upload_date': '20161009',
},
'params': {
# rtmp download
'skip_download': True,
},
},
{
'url': 'https://www.cbsnews.com/video/fort-hood-shooting-army-downplays-mental-illness-as-cause-of-attack/',
'info_dict': {
'id': 'SNJBOYzXiWBOvaLsdzwH8fmtP1SCd91Y',
'ext': 'mp4',
'title': 'Fort Hood shooting: Army downplays mental illness as cause of attack',
'description': 'md5:4a6983e480542d8b333a947bfc64ddc7',
'upload_date': '20140404',
'timestamp': 1396650660,
'uploader': 'CBSI-NEW',
'thumbnail': r're:^https?://.*\.jpg$',
'duration': 205,
'subtitles': {
'en': [{
'ext': 'ttml',
}],
},
},
'params': {
# m3u8 download
'skip_download': True,
},
},
{
# 48 hours
'url': 'http://www.cbsnews.com/news/maria-ridulph-murder-will-the-nations-oldest-cold-case-to-go-to-trial-ever-get-solved/',
'info_dict': {
'title': 'Cold as Ice',
'description': 'Can a childhood memory solve the 1957 murder of 7-year-old Maria Ridulph?',
},
'playlist_mincount': 7,
},
]
def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
entries = []
for embed_url in re.findall(r'<iframe[^>]+data-src="(https?://(?:www\.)?cbsnews\.com/embed/video/[^#]*#[^"]+)"', webpage):
entries.append(self.url_result(embed_url, CBSNewsEmbedIE.ie_key()))
if entries:
return self.playlist_result(
entries, playlist_title=self._html_search_meta(['og:title', 'twitter:title'], webpage),
playlist_description=self._html_search_meta(['og:description', 'twitter:description', 'description'], webpage))
item = self._parse_json(self._html_search_regex(
r'CBSNEWS\.defaultPayload\s*=\s*({.+})',
webpage, 'video JSON info'), display_id)['items'][0]
return self._extract_video_info(item['mpxRefId'], 'cbsnews')
class CBSNewsLiveVideoIE(InfoExtractor):
IE_NAME = 'cbsnews:livevideo'
IE_DESC = 'CBS News Live Videos'
_VALID_URL = r'https?://(?:www\.)?cbsnews\.com/live/video/(?P<id>[^/?#]+)'
# Live videos get deleted soon. See http://www.cbsnews.com/live/ for the latest examples
_TEST = {
'url': 'http://www.cbsnews.com/live/video/clinton-sanders-prepare-to-face-off-in-nh/',
'info_dict': {
'id': 'clinton-sanders-prepare-to-face-off-in-nh',
'ext': 'mp4',
'title': 'Clinton, Sanders Prepare To Face Off In NH',
'duration': 334,
},
'skip': 'Video gone',
}
def _real_extract(self, url):
display_id = self._match_id(url)
video_info = self._download_json(
'http://feeds.cbsn.cbsnews.com/rundown/story', display_id, query={
'device': 'desktop',
'dvr_slug': display_id,
})
formats = self._extract_akamai_formats(video_info['url'], display_id)
self._sort_formats(formats)
return {
'id': display_id,
'display_id': display_id,
'title': video_info['headline'],
'thumbnail': video_info.get('thumbnail_url_hd') or video_info.get('thumbnail_url_sd'),
'duration': parse_duration(video_info.get('segmentDur')),
'formats': formats,
}

View File

@@ -0,0 +1,38 @@
from __future__ import unicode_literals
from .cbs import CBSBaseIE
class CBSSportsIE(CBSBaseIE):
_VALID_URL = r'https?://(?:www\.)?cbssports\.com/[^/]+/(?:video|news)/(?P<id>[^/?#&]+)'
_TESTS = [{
'url': 'https://www.cbssports.com/nba/video/donovan-mitchell-flashes-star-potential-in-game-2-victory-over-thunder/',
'info_dict': {
'id': '1214315075735',
'ext': 'mp4',
'title': 'Donovan Mitchell flashes star potential in Game 2 victory over Thunder',
'description': 'md5:df6f48622612c2d6bd2e295ddef58def',
'timestamp': 1524111457,
'upload_date': '20180419',
'uploader': 'CBSI-NEW',
},
'params': {
# m3u8 download
'skip_download': True,
}
}, {
'url': 'https://www.cbssports.com/nba/news/nba-playoffs-2018-watch-76ers-vs-heat-game-3-series-schedule-tv-channel-online-stream/',
'only_matching': True,
}]
def _extract_video_info(self, filter_query, video_id):
return self._extract_feed_info('dJ5BDC', 'VxxJg8Ymh8sE', filter_query, video_id)
def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
video_id = self._search_regex(
[r'(?:=|%26)pcid%3D(\d+)', r'embedVideo(?:Container)?_(\d+)'],
webpage, 'video id')
return self._extract_video_info('byId=%s' % video_id, video_id)

View File

@@ -0,0 +1,111 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
int_or_none,
parse_iso8601,
try_get,
url_or_none,
)
class CCCIE(InfoExtractor):
IE_NAME = 'media.ccc.de'
_VALID_URL = r'https?://(?:www\.)?media\.ccc\.de/v/(?P<id>[^/?#&]+)'
_TESTS = [{
'url': 'https://media.ccc.de/v/30C3_-_5443_-_en_-_saal_g_-_201312281830_-_introduction_to_processor_design_-_byterazor#video',
'md5': '3a1eda8f3a29515d27f5adb967d7e740',
'info_dict': {
'id': '1839',
'ext': 'mp4',
'title': 'Introduction to Processor Design',
'creator': 'byterazor',
'description': 'md5:df55f6d073d4ceae55aae6f2fd98a0ac',
'thumbnail': r're:^https?://.*\.jpg$',
'upload_date': '20131228',
'timestamp': 1388188800,
'duration': 3710,
'tags': list,
}
}, {
'url': 'https://media.ccc.de/v/32c3-7368-shopshifting#download',
'only_matching': True,
}]
def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
event_id = self._search_regex(r"data-id='(\d+)'", webpage, 'event id')
event_data = self._download_json('https://media.ccc.de/public/events/%s' % event_id, event_id)
formats = []
for recording in event_data.get('recordings', []):
recording_url = recording.get('recording_url')
if not recording_url:
continue
language = recording.get('language')
folder = recording.get('folder')
format_id = None
if language:
format_id = language
if folder:
if language:
format_id += '-' + folder
else:
format_id = folder
vcodec = 'h264' if 'h264' in folder else (
'none' if folder in ('mp3', 'opus') else None
)
formats.append({
'format_id': format_id,
'url': recording_url,
'width': int_or_none(recording.get('width')),
'height': int_or_none(recording.get('height')),
'filesize': int_or_none(recording.get('size'), invscale=1024 * 1024),
'language': language,
'vcodec': vcodec,
})
self._sort_formats(formats)
return {
'id': event_id,
'display_id': display_id,
'title': event_data['title'],
'creator': try_get(event_data, lambda x: ', '.join(x['persons'])),
'description': event_data.get('description'),
'thumbnail': event_data.get('thumb_url'),
'timestamp': parse_iso8601(event_data.get('date')),
'duration': int_or_none(event_data.get('length')),
'tags': event_data.get('tags'),
'formats': formats,
}
class CCCPlaylistIE(InfoExtractor):
IE_NAME = 'media.ccc.de:lists'
_VALID_URL = r'https?://(?:www\.)?media\.ccc\.de/c/(?P<id>[^/?#&]+)'
_TESTS = [{
'url': 'https://media.ccc.de/c/30c3',
'info_dict': {
'title': '30C3',
'id': '30c3',
},
'playlist_count': 135,
}]
def _real_extract(self, url):
playlist_id = self._match_id(url).lower()
conf = self._download_json(
'https://media.ccc.de/public/conferences/' + playlist_id,
playlist_id)
entries = []
for e in conf['events']:
event_url = url_or_none(e.get('frontend_link'))
if event_url:
entries.append(self.url_result(event_url, ie=CCCIE.ie_key()))
return self.playlist_result(entries, playlist_id, conf.get('title'))

View File

@@ -0,0 +1,109 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
clean_html,
int_or_none,
parse_duration,
parse_iso8601,
parse_resolution,
url_or_none,
)
class CCMAIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?ccma\.cat/(?:[^/]+/)*?(?P<type>video|audio)/(?P<id>\d+)'
_TESTS = [{
'url': 'http://www.ccma.cat/tv3/alacarta/lespot-de-la-marato-de-tv3/lespot-de-la-marato-de-tv3/video/5630208/',
'md5': '7296ca43977c8ea4469e719c609b0871',
'info_dict': {
'id': '5630208',
'ext': 'mp4',
'title': 'L\'espot de La Marató de TV3',
'description': 'md5:f12987f320e2f6e988e9908e4fe97765',
'timestamp': 1470918540,
'upload_date': '20160811',
}
}, {
'url': 'http://www.ccma.cat/catradio/alacarta/programa/el-consell-de-savis-analitza-el-derbi/audio/943685/',
'md5': 'fa3e38f269329a278271276330261425',
'info_dict': {
'id': '943685',
'ext': 'mp3',
'title': 'El Consell de Savis analitza el derbi',
'description': 'md5:e2a3648145f3241cb9c6b4b624033e53',
'upload_date': '20171205',
'timestamp': 1512507300,
}
}]
def _real_extract(self, url):
media_type, media_id = re.match(self._VALID_URL, url).groups()
media = self._download_json(
'http://dinamics.ccma.cat/pvideo/media.jsp', media_id, query={
'media': media_type,
'idint': media_id,
})
formats = []
media_url = media['media']['url']
if isinstance(media_url, list):
for format_ in media_url:
format_url = url_or_none(format_.get('file'))
if not format_url:
continue
label = format_.get('label')
f = parse_resolution(label)
f.update({
'url': format_url,
'format_id': label,
})
formats.append(f)
else:
formats.append({
'url': media_url,
'vcodec': 'none' if media_type == 'audio' else None,
})
self._sort_formats(formats)
informacio = media['informacio']
title = informacio['titol']
durada = informacio.get('durada', {})
duration = int_or_none(durada.get('milisegons'), 1000) or parse_duration(durada.get('text'))
timestamp = parse_iso8601(informacio.get('data_emissio', {}).get('utc'))
subtitles = {}
subtitols = media.get('subtitols', {})
if subtitols:
sub_url = subtitols.get('url')
if sub_url:
subtitles.setdefault(
subtitols.get('iso') or subtitols.get('text') or 'ca', []).append({
'url': sub_url,
})
thumbnails = []
imatges = media.get('imatges', {})
if imatges:
thumbnail_url = imatges.get('url')
if thumbnail_url:
thumbnails = [{
'url': thumbnail_url,
'width': int_or_none(imatges.get('amplada')),
'height': int_or_none(imatges.get('alcada')),
}]
return {
'id': media_id,
'title': title,
'description': clean_html(informacio.get('descripcio')),
'duration': duration,
'timestamp': timestamp,
'thumbnails': thumbnails,
'subtitles': subtitles,
'formats': formats,
}

View File

@@ -0,0 +1,191 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
float_or_none,
try_get,
unified_timestamp,
)
class CCTVIE(InfoExtractor):
IE_DESC = '央视网'
_VALID_URL = r'https?://(?:(?:[^/]+)\.(?:cntv|cctv)\.(?:com|cn)|(?:www\.)?ncpa-classic\.com)/(?:[^/]+/)*?(?P<id>[^/?#&]+?)(?:/index)?(?:\.s?html|[?#&]|$)'
_TESTS = [{
# fo.addVariable("videoCenterId","id")
'url': 'http://sports.cntv.cn/2016/02/12/ARTIaBRxv4rTT1yWf1frW2wi160212.shtml',
'md5': 'd61ec00a493e09da810bf406a078f691',
'info_dict': {
'id': '5ecdbeab623f4973b40ff25f18b174e8',
'ext': 'mp4',
'title': '[NBA]二少联手砍下46分 雷霆主场击败鹈鹕(快讯)',
'description': 'md5:7e14a5328dc5eb3d1cd6afbbe0574e95',
'duration': 98,
'uploader': 'songjunjie',
'timestamp': 1455279956,
'upload_date': '20160212',
},
}, {
# var guid = "id"
'url': 'http://tv.cctv.com/2016/02/05/VIDEUS7apq3lKrHG9Dncm03B160205.shtml',
'info_dict': {
'id': 'efc5d49e5b3b4ab2b34f3a502b73d3ae',
'ext': 'mp4',
'title': '[赛车]“车王”舒马赫恢复情况成谜(快讯)',
'description': '2月4日蒙特泽莫罗透露了关于“车王”舒马赫恢复情况但情况是否属实遭到了质疑。',
'duration': 37,
'uploader': 'shujun',
'timestamp': 1454677291,
'upload_date': '20160205',
},
'params': {
'skip_download': True,
},
}, {
# changePlayer('id')
'url': 'http://english.cntv.cn/special/four_comprehensives/index.shtml',
'info_dict': {
'id': '4bb9bb4db7a6471ba85fdeda5af0381e',
'ext': 'mp4',
'title': 'NHnews008 ANNUAL POLITICAL SEASON',
'description': 'Four Comprehensives',
'duration': 60,
'uploader': 'zhangyunlei',
'timestamp': 1425385521,
'upload_date': '20150303',
},
'params': {
'skip_download': True,
},
}, {
# loadvideo('id')
'url': 'http://cctv.cntv.cn/lm/tvseries_russian/yilugesanghua/index.shtml',
'info_dict': {
'id': 'b15f009ff45c43968b9af583fc2e04b2',
'ext': 'mp4',
'title': 'Путь,усыпанный космеями Серия 1',
'description': 'Путь, усыпанный космеями',
'duration': 2645,
'uploader': 'renxue',
'timestamp': 1477479241,
'upload_date': '20161026',
},
'params': {
'skip_download': True,
},
}, {
# var initMyAray = 'id'
'url': 'http://www.ncpa-classic.com/2013/05/22/VIDE1369219508996867.shtml',
'info_dict': {
'id': 'a194cfa7f18c426b823d876668325946',
'ext': 'mp4',
'title': '小泽征尔音乐塾 音乐梦想无国界',
'duration': 2173,
'timestamp': 1369248264,
'upload_date': '20130522',
},
'params': {
'skip_download': True,
},
}, {
# var ids = ["id"]
'url': 'http://www.ncpa-classic.com/clt/more/416/index.shtml',
'info_dict': {
'id': 'a8606119a4884588a79d81c02abecc16',
'ext': 'mp3',
'title': '来自维也纳的新年贺礼',
'description': 'md5:f13764ae8dd484e84dd4b39d5bcba2a7',
'duration': 1578,
'uploader': 'djy',
'timestamp': 1482942419,
'upload_date': '20161228',
},
'params': {
'skip_download': True,
},
'expected_warnings': ['Failed to download m3u8 information'],
}, {
'url': 'http://ent.cntv.cn/2016/01/18/ARTIjprSSJH8DryTVr5Bx8Wb160118.shtml',
'only_matching': True,
}, {
'url': 'http://tv.cntv.cn/video/C39296/e0210d949f113ddfb38d31f00a4e5c44',
'only_matching': True,
}, {
'url': 'http://english.cntv.cn/2016/09/03/VIDEhnkB5y9AgHyIEVphCEz1160903.shtml',
'only_matching': True,
}, {
'url': 'http://tv.cctv.com/2016/09/07/VIDE5C1FnlX5bUywlrjhxXOV160907.shtml',
'only_matching': True,
}, {
'url': 'http://tv.cntv.cn/video/C39296/95cfac44cabd3ddc4a9438780a4e5c44',
'only_matching': True,
}]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
video_id = self._search_regex(
[r'var\s+guid\s*=\s*["\']([\da-fA-F]+)',
r'videoCenterId["\']\s*,\s*["\']([\da-fA-F]+)',
r'changePlayer\s*\(\s*["\']([\da-fA-F]+)',
r'load[Vv]ideo\s*\(\s*["\']([\da-fA-F]+)',
r'var\s+initMyAray\s*=\s*["\']([\da-fA-F]+)',
r'var\s+ids\s*=\s*\[["\']([\da-fA-F]+)'],
webpage, 'video id')
data = self._download_json(
'http://vdn.apps.cntv.cn/api/getHttpVideoInfo.do', video_id,
query={
'pid': video_id,
'url': url,
'idl': 32,
'idlr': 32,
'modifyed': 'false',
})
title = data['title']
formats = []
video = data.get('video')
if isinstance(video, dict):
for quality, chapters_key in enumerate(('lowChapters', 'chapters')):
video_url = try_get(
video, lambda x: x[chapters_key][0]['url'], compat_str)
if video_url:
formats.append({
'url': video_url,
'format_id': 'http',
'quality': quality,
'preference': -1,
})
hls_url = try_get(data, lambda x: x['hls_url'], compat_str)
if hls_url:
hls_url = re.sub(r'maxbr=\d+&?', '', hls_url)
formats.extend(self._extract_m3u8_formats(
hls_url, video_id, 'mp4', entry_protocol='m3u8_native',
m3u8_id='hls', fatal=False))
self._sort_formats(formats)
uploader = data.get('editer_name')
description = self._html_search_meta(
'description', webpage, default=None)
timestamp = unified_timestamp(data.get('f_pgmtime'))
duration = float_or_none(try_get(video, lambda x: x['totalLength']))
return {
'id': video_id,
'title': title,
'description': description,
'uploader': uploader,
'timestamp': timestamp,
'duration': duration,
'formats': formats,
}

View File

@@ -0,0 +1,182 @@
# coding: utf-8
from __future__ import unicode_literals
import codecs
import re
from .common import InfoExtractor
from ..utils import (
ExtractorError,
float_or_none,
int_or_none,
multipart_encode,
parse_duration,
random_birthday,
urljoin,
)
class CDAIE(InfoExtractor):
_VALID_URL = r'https?://(?:(?:www\.)?cda\.pl/video|ebd\.cda\.pl/[0-9]+x[0-9]+)/(?P<id>[0-9a-z]+)'
_BASE_URL = 'http://www.cda.pl/'
_TESTS = [{
'url': 'http://www.cda.pl/video/5749950c',
'md5': '6f844bf51b15f31fae165365707ae970',
'info_dict': {
'id': '5749950c',
'ext': 'mp4',
'height': 720,
'title': 'Oto dlaczego przed zakrętem należy zwolnić.',
'description': 'md5:269ccd135d550da90d1662651fcb9772',
'thumbnail': r're:^https?://.*\.jpg$',
'average_rating': float,
'duration': 39,
'age_limit': 0,
}
}, {
'url': 'http://www.cda.pl/video/57413289',
'md5': 'a88828770a8310fc00be6c95faf7f4d5',
'info_dict': {
'id': '57413289',
'ext': 'mp4',
'title': 'Lądowanie na lotnisku na Maderze',
'description': 'md5:60d76b71186dcce4e0ba6d4bbdb13e1a',
'thumbnail': r're:^https?://.*\.jpg$',
'uploader': 'crash404',
'view_count': int,
'average_rating': float,
'duration': 137,
'age_limit': 0,
}
}, {
# Age-restricted
'url': 'http://www.cda.pl/video/1273454c4',
'info_dict': {
'id': '1273454c4',
'ext': 'mp4',
'title': 'Bronson (2008) napisy HD 1080p',
'description': 'md5:1b6cb18508daf2dc4e0fa4db77fec24c',
'height': 1080,
'uploader': 'boniek61',
'thumbnail': r're:^https?://.*\.jpg$',
'duration': 5554,
'age_limit': 18,
'view_count': int,
'average_rating': float,
},
}, {
'url': 'http://ebd.cda.pl/0x0/5749950c',
'only_matching': True,
}]
def _download_age_confirm_page(self, url, video_id, *args, **kwargs):
form_data = random_birthday('rok', 'miesiac', 'dzien')
form_data.update({'return': url, 'module': 'video', 'module_id': video_id})
data, content_type = multipart_encode(form_data)
return self._download_webpage(
urljoin(url, '/a/validatebirth'), video_id, *args,
data=data, headers={
'Referer': url,
'Content-Type': content_type,
}, **kwargs)
def _real_extract(self, url):
video_id = self._match_id(url)
self._set_cookie('cda.pl', 'cda.player', 'html5')
webpage = self._download_webpage(
self._BASE_URL + '/video/' + video_id, video_id)
if 'Ten film jest dostępny dla użytkowników premium' in webpage:
raise ExtractorError('This video is only available for premium users.', expected=True)
need_confirm_age = False
if self._html_search_regex(r'(<form[^>]+action="/a/validatebirth")',
webpage, 'birthday validate form', default=None):
webpage = self._download_age_confirm_page(
url, video_id, note='Confirming age')
need_confirm_age = True
formats = []
uploader = self._search_regex(r'''(?x)
<(span|meta)[^>]+itemprop=(["\'])author\2[^>]*>
(?:<\1[^>]*>[^<]*</\1>|(?!</\1>)(?:.|\n))*?
<(span|meta)[^>]+itemprop=(["\'])name\4[^>]*>(?P<uploader>[^<]+)</\3>
''', webpage, 'uploader', default=None, group='uploader')
view_count = self._search_regex(
r'Odsłony:(?:\s|&nbsp;)*([0-9]+)', webpage,
'view_count', default=None)
average_rating = self._search_regex(
r'<(?:span|meta)[^>]+itemprop=(["\'])ratingValue\1[^>]*>(?P<rating_value>[0-9.]+)',
webpage, 'rating', fatal=False, group='rating_value')
info_dict = {
'id': video_id,
'title': self._og_search_title(webpage),
'description': self._og_search_description(webpage),
'uploader': uploader,
'view_count': int_or_none(view_count),
'average_rating': float_or_none(average_rating),
'thumbnail': self._og_search_thumbnail(webpage),
'formats': formats,
'duration': None,
'age_limit': 18 if need_confirm_age else 0,
}
def extract_format(page, version):
json_str = self._html_search_regex(
r'player_data=(\\?["\'])(?P<player_data>.+?)\1', page,
'%s player_json' % version, fatal=False, group='player_data')
if not json_str:
return
player_data = self._parse_json(
json_str, '%s player_data' % version, fatal=False)
if not player_data:
return
video = player_data.get('video')
if not video or 'file' not in video:
self.report_warning('Unable to extract %s version information' % version)
return
if video['file'].startswith('uggc'):
video['file'] = codecs.decode(video['file'], 'rot_13')
if video['file'].endswith('adc.mp4'):
video['file'] = video['file'].replace('adc.mp4', '.mp4')
f = {
'url': video['file'],
}
m = re.search(
r'<a[^>]+data-quality="(?P<format_id>[^"]+)"[^>]+href="[^"]+"[^>]+class="[^"]*quality-btn-active[^"]*">(?P<height>[0-9]+)p',
page)
if m:
f.update({
'format_id': m.group('format_id'),
'height': int(m.group('height')),
})
info_dict['formats'].append(f)
if not info_dict['duration']:
info_dict['duration'] = parse_duration(video.get('duration'))
extract_format(webpage, 'default')
for href, resolution in re.findall(
r'<a[^>]+data-quality="[^"]+"[^>]+href="([^"]+)"[^>]+class="quality-btn"[^>]*>([0-9]+p)',
webpage):
if need_confirm_age:
handler = self._download_age_confirm_page
else:
handler = self._download_webpage
webpage = handler(
self._BASE_URL + href, video_id,
'Downloading %s version information' % resolution, fatal=False)
if not webpage:
# Manually report warning because empty page is returned when
# invalid version is requested.
self.report_warning('Unable to download %s version information' % resolution)
continue
extract_format(webpage, resolution)
self._sort_formats(formats)
return info_dict

View File

@@ -0,0 +1,289 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..compat import (
compat_urllib_parse_unquote,
compat_urllib_parse_urlparse,
)
from ..utils import (
ExtractorError,
float_or_none,
sanitized_Request,
unescapeHTML,
update_url_query,
urlencode_postdata,
USER_AGENTS,
)
class CeskaTelevizeIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?ceskatelevize\.cz/ivysilani/(?:[^/?#&]+/)*(?P<id>[^/#?]+)'
_TESTS = [{
'url': 'http://www.ceskatelevize.cz/ivysilani/ivysilani/10441294653-hyde-park-civilizace/214411058091220',
'info_dict': {
'id': '61924494877246241',
'ext': 'mp4',
'title': 'Hyde Park Civilizace: Život v Grónsku',
'description': 'md5:3fec8f6bb497be5cdb0c9e8781076626',
'thumbnail': r're:^https?://.*\.jpg',
'duration': 3350,
},
'params': {
# m3u8 download
'skip_download': True,
},
}, {
'url': 'http://www.ceskatelevize.cz/ivysilani/10441294653-hyde-park-civilizace/215411058090502/bonus/20641-bonus-01-en',
'info_dict': {
'id': '61924494877028507',
'ext': 'mp4',
'title': 'Hyde Park Civilizace: Bonus 01 - En',
'description': 'English Subtittles',
'thumbnail': r're:^https?://.*\.jpg',
'duration': 81.3,
},
'params': {
# m3u8 download
'skip_download': True,
},
}, {
# live stream
'url': 'http://www.ceskatelevize.cz/ivysilani/zive/ct4/',
'info_dict': {
'id': 402,
'ext': 'mp4',
'title': r're:^ČT Sport \d{4}-\d{2}-\d{2} \d{2}:\d{2}$',
'is_live': True,
},
'params': {
# m3u8 download
'skip_download': True,
},
'skip': 'Georestricted to Czech Republic',
}, {
'url': 'http://www.ceskatelevize.cz/ivysilani/embed/iFramePlayer.php?hash=d6a3e1370d2e4fa76296b90bad4dfc19673b641e&IDEC=217 562 22150/0004&channelID=1&width=100%25',
'only_matching': True,
}]
def _real_extract(self, url):
playlist_id = self._match_id(url)
webpage = self._download_webpage(url, playlist_id)
NOT_AVAILABLE_STRING = 'This content is not available at your territory due to limited copyright.'
if '%s</p>' % NOT_AVAILABLE_STRING in webpage:
raise ExtractorError(NOT_AVAILABLE_STRING, expected=True)
type_ = None
episode_id = None
playlist = self._parse_json(
self._search_regex(
r'getPlaylistUrl\(\[({.+?})\]', webpage, 'playlist',
default='{}'), playlist_id)
if playlist:
type_ = playlist.get('type')
episode_id = playlist.get('id')
if not type_:
type_ = self._html_search_regex(
r'getPlaylistUrl\(\[\{"type":"(.+?)","id":".+?"\}\],',
webpage, 'type')
if not episode_id:
episode_id = self._html_search_regex(
r'getPlaylistUrl\(\[\{"type":".+?","id":"(.+?)"\}\],',
webpage, 'episode_id')
data = {
'playlist[0][type]': type_,
'playlist[0][id]': episode_id,
'requestUrl': compat_urllib_parse_urlparse(url).path,
'requestSource': 'iVysilani',
}
entries = []
for user_agent in (None, USER_AGENTS['Safari']):
req = sanitized_Request(
'https://www.ceskatelevize.cz/ivysilani/ajax/get-client-playlist',
data=urlencode_postdata(data))
req.add_header('Content-type', 'application/x-www-form-urlencoded')
req.add_header('x-addr', '127.0.0.1')
req.add_header('X-Requested-With', 'XMLHttpRequest')
if user_agent:
req.add_header('User-Agent', user_agent)
req.add_header('Referer', url)
playlistpage = self._download_json(req, playlist_id, fatal=False)
if not playlistpage:
continue
playlist_url = playlistpage['url']
if playlist_url == 'error_region':
raise ExtractorError(NOT_AVAILABLE_STRING, expected=True)
req = sanitized_Request(compat_urllib_parse_unquote(playlist_url))
req.add_header('Referer', url)
playlist_title = self._og_search_title(webpage, default=None)
playlist_description = self._og_search_description(webpage, default=None)
playlist = self._download_json(req, playlist_id, fatal=False)
if not playlist:
continue
playlist = playlist.get('playlist')
if not isinstance(playlist, list):
continue
playlist_len = len(playlist)
for num, item in enumerate(playlist):
is_live = item.get('type') == 'LIVE'
formats = []
for format_id, stream_url in item.get('streamUrls', {}).items():
if 'drmOnly=true' in stream_url:
continue
if 'playerType=flash' in stream_url:
stream_formats = self._extract_m3u8_formats(
stream_url, playlist_id, 'mp4', 'm3u8_native',
m3u8_id='hls-%s' % format_id, fatal=False)
else:
stream_formats = self._extract_mpd_formats(
stream_url, playlist_id,
mpd_id='dash-%s' % format_id, fatal=False)
# See https://github.com/ytdl-org/youtube-dl/issues/12119#issuecomment-280037031
if format_id == 'audioDescription':
for f in stream_formats:
f['source_preference'] = -10
formats.extend(stream_formats)
if user_agent and len(entries) == playlist_len:
entries[num]['formats'].extend(formats)
continue
item_id = item.get('id') or item['assetId']
title = item['title']
duration = float_or_none(item.get('duration'))
thumbnail = item.get('previewImageUrl')
subtitles = {}
if item.get('type') == 'VOD':
subs = item.get('subtitles')
if subs:
subtitles = self.extract_subtitles(episode_id, subs)
if playlist_len == 1:
final_title = playlist_title or title
if is_live:
final_title = self._live_title(final_title)
else:
final_title = '%s (%s)' % (playlist_title, title)
entries.append({
'id': item_id,
'title': final_title,
'description': playlist_description if playlist_len == 1 else None,
'thumbnail': thumbnail,
'duration': duration,
'formats': formats,
'subtitles': subtitles,
'is_live': is_live,
})
for e in entries:
self._sort_formats(e['formats'])
return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
def _get_subtitles(self, episode_id, subs):
original_subtitles = self._download_webpage(
subs[0]['url'], episode_id, 'Downloading subtitles')
srt_subs = self._fix_subtitles(original_subtitles)
return {
'cs': [{
'ext': 'srt',
'data': srt_subs,
}]
}
@staticmethod
def _fix_subtitles(subtitles):
""" Convert millisecond-based subtitles to SRT """
def _msectotimecode(msec):
""" Helper utility to convert milliseconds to timecode """
components = []
for divider in [1000, 60, 60, 100]:
components.append(msec % divider)
msec //= divider
return '{3:02}:{2:02}:{1:02},{0:03}'.format(*components)
def _fix_subtitle(subtitle):
for line in subtitle.splitlines():
m = re.match(r'^\s*([0-9]+);\s*([0-9]+)\s+([0-9]+)\s*$', line)
if m:
yield m.group(1)
start, stop = (_msectotimecode(int(t)) for t in m.groups()[1:])
yield '{0} --> {1}'.format(start, stop)
else:
yield line
return '\r\n'.join(_fix_subtitle(subtitles))
class CeskaTelevizePoradyIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?ceskatelevize\.cz/porady/(?:[^/?#&]+/)*(?P<id>[^/#?]+)'
_TESTS = [{
# video with 18+ caution trailer
'url': 'http://www.ceskatelevize.cz/porady/10520528904-queer/215562210900007-bogotart/',
'info_dict': {
'id': '215562210900007-bogotart',
'title': 'Queer: Bogotart',
'description': 'Alternativní průvodce současným queer světem',
},
'playlist': [{
'info_dict': {
'id': '61924494876844842',
'ext': 'mp4',
'title': 'Queer: Bogotart (Varování 18+)',
'duration': 10.2,
},
}, {
'info_dict': {
'id': '61924494877068022',
'ext': 'mp4',
'title': 'Queer: Bogotart (Queer)',
'thumbnail': r're:^https?://.*\.jpg',
'duration': 1558.3,
},
}],
'params': {
# m3u8 download
'skip_download': True,
},
}, {
# iframe embed
'url': 'http://www.ceskatelevize.cz/porady/10614999031-neviditelni/21251212048/',
'only_matching': True,
}]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
data_url = update_url_query(unescapeHTML(self._search_regex(
(r'<span[^>]*\bdata-url=(["\'])(?P<url>(?:(?!\1).)+)\1',
r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//(?:www\.)?ceskatelevize\.cz/ivysilani/embed/iFramePlayer\.php.*?)\1'),
webpage, 'iframe player url', group='url')), query={
'autoStart': 'true',
})
return self.url_result(data_url, ie=CeskaTelevizeIE.ie_key())

View File

@@ -0,0 +1,262 @@
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
clean_html,
ExtractorError,
int_or_none,
parse_iso8601,
qualities,
unescapeHTML,
)
class Channel9IE(InfoExtractor):
IE_DESC = 'Channel 9'
IE_NAME = 'channel9'
_VALID_URL = r'https?://(?:www\.)?(?:channel9\.msdn\.com|s\.ch9\.ms)/(?P<contentpath>.+?)(?P<rss>/RSS)?/?(?:[?#&]|$)'
_TESTS = [{
'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002',
'md5': '32083d4eaf1946db6d454313f44510ca',
'info_dict': {
'id': '6c413323-383a-49dc-88f9-a22800cab024',
'ext': 'wmv',
'title': 'Developer Kick-Off Session: Stuff We Love',
'description': 'md5:b80bf9355a503c193aff7ec6cd5a7731',
'duration': 4576,
'thumbnail': r're:https?://.*\.jpg',
'timestamp': 1377717420,
'upload_date': '20130828',
'session_code': 'KOS002',
'session_room': 'Arena 1A',
'session_speakers': 'count:5',
},
}, {
'url': 'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing',
'md5': 'dcf983ee6acd2088e7188c3cf79b46bc',
'info_dict': {
'id': 'fe8e435f-bb93-4e01-8e97-a28c01887024',
'ext': 'wmv',
'title': 'Self-service BI with Power BI - nuclear testing',
'description': 'md5:2d17fec927fc91e9e17783b3ecc88f54',
'duration': 1540,
'thumbnail': r're:https?://.*\.jpg',
'timestamp': 1386381991,
'upload_date': '20131207',
'authors': ['Mike Wilmot'],
},
}, {
# low quality mp4 is best
'url': 'https://channel9.msdn.com/Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library',
'info_dict': {
'id': '33ad69d2-6a4e-4172-83a1-a523013dec76',
'ext': 'mp4',
'title': 'Ranges for the Standard Library',
'description': 'md5:9895e0a9fd80822d2f01c454b8f4a372',
'duration': 5646,
'thumbnail': r're:https?://.*\.jpg',
'upload_date': '20150930',
'timestamp': 1443640735,
},
'params': {
'skip_download': True,
},
}, {
'url': 'https://channel9.msdn.com/Events/DEVintersection/DEVintersection-2016/RSS',
'info_dict': {
'id': 'Events/DEVintersection/DEVintersection-2016',
'title': 'DEVintersection 2016 Orlando Sessions',
},
'playlist_mincount': 14,
}, {
'url': 'https://channel9.msdn.com/Niners/Splendid22/Queue/76acff796e8f411184b008028e0d492b/RSS',
'only_matching': True,
}, {
'url': 'https://channel9.msdn.com/Events/Speakers/scott-hanselman/RSS?UrlSafeName=scott-hanselman',
'only_matching': True,
}]
_RSS_URL = 'http://channel9.msdn.com/%s/RSS'
@staticmethod
def _extract_urls(webpage):
return re.findall(
r'<iframe[^>]+src=["\'](https?://channel9\.msdn\.com/(?:[^/]+/)+)player\b',
webpage)
def _extract_list(self, video_id, rss_url=None):
if not rss_url:
rss_url = self._RSS_URL % video_id
rss = self._download_xml(rss_url, video_id, 'Downloading RSS')
entries = [self.url_result(session_url.text, 'Channel9')
for session_url in rss.findall('./channel/item/link')]
title_text = rss.find('./channel/title').text
return self.playlist_result(entries, video_id, title_text)
def _real_extract(self, url):
content_path, rss = re.match(self._VALID_URL, url).groups()
if rss:
return self._extract_list(content_path, url)
webpage = self._download_webpage(
url, content_path, 'Downloading web page')
episode_data = self._search_regex(
r"data-episode='([^']+)'", webpage, 'episode data', default=None)
if episode_data:
episode_data = self._parse_json(unescapeHTML(
episode_data), content_path)
content_id = episode_data['contentId']
is_session = '/Sessions(' in episode_data['api']
content_url = 'https://channel9.msdn.com/odata' + episode_data['api'] + '?$select=Captions,CommentCount,MediaLengthInSeconds,PublishedDate,Rating,RatingCount,Title,VideoMP4High,VideoMP4Low,VideoMP4Medium,VideoPlayerPreviewImage,VideoWMV,VideoWMVHQ,Views,'
if is_session:
content_url += 'Code,Description,Room,Slides,Speakers,ZipFile&$expand=Speakers'
else:
content_url += 'Authors,Body&$expand=Authors'
content_data = self._download_json(content_url, content_id)
title = content_data['Title']
QUALITIES = (
'mp3',
'wmv', 'mp4',
'wmv-low', 'mp4-low',
'wmv-mid', 'mp4-mid',
'wmv-high', 'mp4-high',
)
quality_key = qualities(QUALITIES)
def quality(quality_id, format_url):
return (len(QUALITIES) if '_Source.' in format_url
else quality_key(quality_id))
formats = []
urls = set()
SITE_QUALITIES = {
'MP3': 'mp3',
'MP4': 'mp4',
'Low Quality WMV': 'wmv-low',
'Low Quality MP4': 'mp4-low',
'Mid Quality WMV': 'wmv-mid',
'Mid Quality MP4': 'mp4-mid',
'High Quality WMV': 'wmv-high',
'High Quality MP4': 'mp4-high',
}
formats_select = self._search_regex(
r'(?s)<select[^>]+name=["\']format[^>]+>(.+?)</select', webpage,
'formats select', default=None)
if formats_select:
for mobj in re.finditer(
r'<option\b[^>]+\bvalue=(["\'])(?P<url>(?:(?!\1).)+)\1[^>]*>\s*(?P<format>[^<]+?)\s*<',
formats_select):
format_url = mobj.group('url')
if format_url in urls:
continue
urls.add(format_url)
format_id = mobj.group('format')
quality_id = SITE_QUALITIES.get(format_id, format_id)
formats.append({
'url': format_url,
'format_id': quality_id,
'quality': quality(quality_id, format_url),
'vcodec': 'none' if quality_id == 'mp3' else None,
})
API_QUALITIES = {
'VideoMP4Low': 'mp4-low',
'VideoWMV': 'wmv-mid',
'VideoMP4Medium': 'mp4-mid',
'VideoMP4High': 'mp4-high',
'VideoWMVHQ': 'wmv-hq',
}
for format_id, q in API_QUALITIES.items():
q_url = content_data.get(format_id)
if not q_url or q_url in urls:
continue
urls.add(q_url)
formats.append({
'url': q_url,
'format_id': q,
'quality': quality(q, q_url),
})
self._sort_formats(formats)
slides = content_data.get('Slides')
zip_file = content_data.get('ZipFile')
if not formats and not slides and not zip_file:
raise ExtractorError(
'None of recording, slides or zip are available for %s' % content_path)
subtitles = {}
for caption in content_data.get('Captions', []):
caption_url = caption.get('Url')
if not caption_url:
continue
subtitles.setdefault(caption.get('Language', 'en'), []).append({
'url': caption_url,
'ext': 'vtt',
})
common = {
'id': content_id,
'title': title,
'description': clean_html(content_data.get('Description') or content_data.get('Body')),
'thumbnail': content_data.get('VideoPlayerPreviewImage'),
'duration': int_or_none(content_data.get('MediaLengthInSeconds')),
'timestamp': parse_iso8601(content_data.get('PublishedDate')),
'avg_rating': int_or_none(content_data.get('Rating')),
'rating_count': int_or_none(content_data.get('RatingCount')),
'view_count': int_or_none(content_data.get('Views')),
'comment_count': int_or_none(content_data.get('CommentCount')),
'subtitles': subtitles,
}
if is_session:
speakers = []
for s in content_data.get('Speakers', []):
speaker_name = s.get('FullName')
if not speaker_name:
continue
speakers.append(speaker_name)
common.update({
'session_code': content_data.get('Code'),
'session_room': content_data.get('Room'),
'session_speakers': speakers,
})
else:
authors = []
for a in content_data.get('Authors', []):
author_name = a.get('DisplayName')
if not author_name:
continue
authors.append(author_name)
common['authors'] = authors
contents = []
if slides:
d = common.copy()
d.update({'title': title + '-Slides', 'url': slides})
contents.append(d)
if zip_file:
d = common.copy()
d.update({'title': title + '-Zip', 'url': zip_file})
contents.append(d)
if formats:
d = common.copy()
d.update({'title': title, 'formats': formats})
contents.append(d)
return self.playlist_result(contents)
else:
return self._extract_list(content_path)

View File

@@ -0,0 +1,54 @@
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import remove_end
class CharlieRoseIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?charlierose\.com/(?:video|episode)(?:s|/player)/(?P<id>\d+)'
_TESTS = [{
'url': 'https://charlierose.com/videos/27996',
'md5': 'fda41d49e67d4ce7c2411fd2c4702e09',
'info_dict': {
'id': '27996',
'ext': 'mp4',
'title': 'Remembering Zaha Hadid',
'thumbnail': r're:^https?://.*\.jpg\?\d+',
'description': 'We revisit past conversations with Zaha Hadid, in memory of the world renowned Iraqi architect.',
'subtitles': {
'en': [{
'ext': 'vtt',
}],
},
},
}, {
'url': 'https://charlierose.com/videos/27996',
'only_matching': True,
}, {
'url': 'https://charlierose.com/episodes/30887?autoplay=true',
'only_matching': True,
}]
_PLAYER_BASE = 'https://charlierose.com/video/player/%s'
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(self._PLAYER_BASE % video_id, video_id)
title = remove_end(self._og_search_title(webpage), ' - Charlie Rose')
info_dict = self._parse_html5_media_entries(
self._PLAYER_BASE % video_id, webpage, video_id,
m3u8_entry_protocol='m3u8_native')[0]
self._sort_formats(info_dict['formats'])
self._remove_duplicate_formats(info_dict['formats'])
info_dict.update({
'id': video_id,
'title': title,
'thumbnail': self._og_search_thumbnail(webpage),
'description': self._og_search_description(webpage),
})
return info_dict

View File

@@ -0,0 +1,109 @@
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
ExtractorError,
lowercase_escape,
url_or_none,
)
class ChaturbateIE(InfoExtractor):
_VALID_URL = r'https?://(?:[^/]+\.)?chaturbate\.com/(?:fullvideo/?\?.*?\bb=)?(?P<id>[^/?&#]+)'
_TESTS = [{
'url': 'https://www.chaturbate.com/siswet19/',
'info_dict': {
'id': 'siswet19',
'ext': 'mp4',
'title': 're:^siswet19 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
'age_limit': 18,
'is_live': True,
},
'params': {
'skip_download': True,
},
'skip': 'Room is offline',
}, {
'url': 'https://chaturbate.com/fullvideo/?b=caylin',
'only_matching': True,
}, {
'url': 'https://en.chaturbate.com/siswet19/',
'only_matching': True,
}]
_ROOM_OFFLINE = 'Room is currently offline'
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(
'https://chaturbate.com/%s/' % video_id, video_id,
headers=self.geo_verification_headers())
found_m3u8_urls = []
data = self._parse_json(
self._search_regex(
r'initialRoomDossier\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1',
webpage, 'data', default='{}', group='value'),
video_id, transform_source=lowercase_escape, fatal=False)
if data:
m3u8_url = url_or_none(data.get('hls_source'))
if m3u8_url:
found_m3u8_urls.append(m3u8_url)
if not found_m3u8_urls:
for m in re.finditer(
r'(\\u002[27])(?P<url>http.+?\.m3u8.*?)\1', webpage):
found_m3u8_urls.append(lowercase_escape(m.group('url')))
if not found_m3u8_urls:
for m in re.finditer(
r'(["\'])(?P<url>http.+?\.m3u8.*?)\1', webpage):
found_m3u8_urls.append(m.group('url'))
m3u8_urls = []
for found_m3u8_url in found_m3u8_urls:
m3u8_fast_url, m3u8_no_fast_url = found_m3u8_url, found_m3u8_url.replace('_fast', '')
for m3u8_url in (m3u8_fast_url, m3u8_no_fast_url):
if m3u8_url not in m3u8_urls:
m3u8_urls.append(m3u8_url)
if not m3u8_urls:
error = self._search_regex(
[r'<span[^>]+class=(["\'])desc_span\1[^>]*>(?P<error>[^<]+)</span>',
r'<div[^>]+id=(["\'])defchat\1[^>]*>\s*<p><strong>(?P<error>[^<]+)<'],
webpage, 'error', group='error', default=None)
if not error:
if any(p in webpage for p in (
self._ROOM_OFFLINE, 'offline_tipping', 'tip_offline')):
error = self._ROOM_OFFLINE
if error:
raise ExtractorError(error, expected=True)
raise ExtractorError('Unable to find stream URL')
formats = []
for m3u8_url in m3u8_urls:
for known_id in ('fast', 'slow'):
if '_%s' % known_id in m3u8_url:
m3u8_id = known_id
break
else:
m3u8_id = None
formats.extend(self._extract_m3u8_formats(
m3u8_url, video_id, ext='mp4',
# ffmpeg skips segments for fast m3u8
preference=-10 if m3u8_id == 'fast' else None,
m3u8_id=m3u8_id, fatal=False, live=True))
self._sort_formats(formats)
return {
'id': video_id,
'title': self._live_title(video_id),
'thumbnail': 'https://roomimg.stream.highwebmedia.com/ri/%s.jpg' % video_id,
'age_limit': self._rta_search(webpage),
'is_live': True,
'formats': formats,
}

View File

@@ -0,0 +1,96 @@
from __future__ import unicode_literals
import re
import json
from .common import InfoExtractor
from .youtube import YoutubeIE
from ..compat import compat_b64decode
from ..utils import (
clean_html,
ExtractorError
)
class ChilloutzoneIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?chilloutzone\.net/video/(?P<id>[\w|-]+)\.html'
_TESTS = [{
'url': 'http://www.chilloutzone.net/video/enemene-meck-alle-katzen-weg.html',
'md5': 'a76f3457e813ea0037e5244f509e66d1',
'info_dict': {
'id': 'enemene-meck-alle-katzen-weg',
'ext': 'mp4',
'title': 'Enemene Meck - Alle Katzen weg',
'description': 'Ist das der Umkehrschluss des Niesenden Panda-Babys?',
},
}, {
'note': 'Video hosted at YouTube',
'url': 'http://www.chilloutzone.net/video/eine-sekunde-bevor.html',
'info_dict': {
'id': '1YVQaAgHyRU',
'ext': 'mp4',
'title': '16 Photos Taken 1 Second Before Disaster',
'description': 'md5:58a8fcf6a459fe0a08f54140f0ad1814',
'uploader': 'BuzzFeedVideo',
'uploader_id': 'BuzzFeedVideo',
'upload_date': '20131105',
},
}, {
'note': 'Video hosted at Vimeo',
'url': 'http://www.chilloutzone.net/video/icon-blending.html',
'md5': '2645c678b8dc4fefcc0e1b60db18dac1',
'info_dict': {
'id': '85523671',
'ext': 'mp4',
'title': 'The Sunday Times - Icons',
'description': 're:(?s)^Watch the making of - makingoficons.com.{300,}',
'uploader': 'Us',
'uploader_id': 'usfilms',
'upload_date': '20140131'
},
}]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
base64_video_info = self._html_search_regex(
r'var cozVidData = "(.+?)";', webpage, 'video data')
decoded_video_info = compat_b64decode(base64_video_info).decode('utf-8')
video_info_dict = json.loads(decoded_video_info)
# get video information from dict
video_url = video_info_dict['mediaUrl']
description = clean_html(video_info_dict.get('description'))
title = video_info_dict['title']
native_platform = video_info_dict['nativePlatform']
native_video_id = video_info_dict['nativeVideoId']
source_priority = video_info_dict['sourcePriority']
# If nativePlatform is None a fallback mechanism is used (i.e. youtube embed)
if native_platform is None:
youtube_url = YoutubeIE._extract_url(webpage)
if youtube_url:
return self.url_result(youtube_url, ie=YoutubeIE.ie_key())
# Non Fallback: Decide to use native source (e.g. youtube or vimeo) or
# the own CDN
if source_priority == 'native':
if native_platform == 'youtube':
return self.url_result(native_video_id, ie='Youtube')
if native_platform == 'vimeo':
return self.url_result(
'http://vimeo.com/' + native_video_id, ie='Vimeo')
if not video_url:
raise ExtractorError('No video found')
return {
'id': video_id,
'url': video_url,
'ext': 'mp4',
'title': title,
'description': description,
}

View File

@@ -0,0 +1,91 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..compat import compat_b64decode
from ..utils import parse_duration
class ChirbitIE(InfoExtractor):
IE_NAME = 'chirbit'
_VALID_URL = r'https?://(?:www\.)?chirb\.it/(?:(?:wp|pl)/|fb_chirbit_player\.swf\?key=)?(?P<id>[\da-zA-Z]+)'
_TESTS = [{
'url': 'http://chirb.it/be2abG',
'info_dict': {
'id': 'be2abG',
'ext': 'mp3',
'title': 'md5:f542ea253f5255240be4da375c6a5d7e',
'description': 'md5:f24a4e22a71763e32da5fed59e47c770',
'duration': 306,
'uploader': 'Gerryaudio',
},
'params': {
'skip_download': True,
}
}, {
'url': 'https://chirb.it/fb_chirbit_player.swf?key=PrIPv5',
'only_matching': True,
}, {
'url': 'https://chirb.it/wp/MN58c2',
'only_matching': True,
}]
def _real_extract(self, url):
audio_id = self._match_id(url)
webpage = self._download_webpage(
'http://chirb.it/%s' % audio_id, audio_id)
data_fd = self._search_regex(
r'data-fd=(["\'])(?P<url>(?:(?!\1).)+)\1',
webpage, 'data fd', group='url')
# Reverse engineered from https://chirb.it/js/chirbit.player.js (look
# for soundURL)
audio_url = compat_b64decode(data_fd[::-1]).decode('utf-8')
title = self._search_regex(
r'class=["\']chirbit-title["\'][^>]*>([^<]+)', webpage, 'title')
description = self._search_regex(
r'<h3>Description</h3>\s*<pre[^>]*>([^<]+)</pre>',
webpage, 'description', default=None)
duration = parse_duration(self._search_regex(
r'class=["\']c-length["\'][^>]*>([^<]+)',
webpage, 'duration', fatal=False))
uploader = self._search_regex(
r'id=["\']chirbit-username["\'][^>]*>([^<]+)',
webpage, 'uploader', fatal=False)
return {
'id': audio_id,
'url': audio_url,
'title': title,
'description': description,
'duration': duration,
'uploader': uploader,
}
class ChirbitProfileIE(InfoExtractor):
IE_NAME = 'chirbit:profile'
_VALID_URL = r'https?://(?:www\.)?chirbit\.com/(?:rss/)?(?P<id>[^/]+)'
_TEST = {
'url': 'http://chirbit.com/ScarletBeauty',
'info_dict': {
'id': 'ScarletBeauty',
},
'playlist_mincount': 3,
}
def _real_extract(self, url):
profile_id = self._match_id(url)
webpage = self._download_webpage(url, profile_id)
entries = [
self.url_result(self._proto_relative_url('//chirb.it/' + video_id))
for _, video_id in re.findall(r'<input[^>]+id=([\'"])copy-btn-(?P<id>[0-9a-zA-Z]+)\1', webpage)]
return self.playlist_result(entries, profile_id)

View File

@@ -0,0 +1,58 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
unified_strdate,
xpath_text,
)
class CinchcastIE(InfoExtractor):
_VALID_URL = r'https?://player\.cinchcast\.com/.*?(?:assetId|show_id)=(?P<id>[0-9]+)'
_TESTS = [{
'url': 'http://player.cinchcast.com/?show_id=5258197&platformId=1&assetType=single',
'info_dict': {
'id': '5258197',
'ext': 'mp3',
'title': 'Train Your Brain to Up Your Game with Coach Mandy',
'upload_date': '20130816',
},
}, {
# Actual test is run in generic, look for undergroundwellness
'url': 'http://player.cinchcast.com/?platformId=1&#038;assetType=single&#038;assetId=7141703',
'only_matching': True,
}]
def _real_extract(self, url):
video_id = self._match_id(url)
doc = self._download_xml(
'http://www.blogtalkradio.com/playerasset/mrss?assetType=single&assetId=%s' % video_id,
video_id)
item = doc.find('.//item')
title = xpath_text(item, './title', fatal=True)
date_str = xpath_text(
item, './{http://developer.longtailvideo.com/trac/}date')
upload_date = unified_strdate(date_str, day_first=False)
# duration is present but wrong
formats = [{
'format_id': 'main',
'url': item.find('./{http://search.yahoo.com/mrss/}content').attrib['url'],
}]
backup_url = xpath_text(
item, './{http://developer.longtailvideo.com/trac/}backupContent')
if backup_url:
formats.append({
'preference': 2, # seems to be more reliable
'format_id': 'backup',
'url': backup_url,
})
self._sort_formats(formats)
return {
'id': video_id,
'title': title,
'upload_date': upload_date,
'formats': formats,
}

View File

@@ -0,0 +1,29 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .hbo import HBOBaseIE
class CinemaxIE(HBOBaseIE):
_VALID_URL = r'https?://(?:www\.)?cinemax\.com/(?P<path>[^/]+/video/[0-9a-z-]+-(?P<id>\d+))'
_TESTS = [{
'url': 'https://www.cinemax.com/warrior/video/s1-ep-1-recap-20126903',
'md5': '82e0734bba8aa7ef526c9dd00cf35a05',
'info_dict': {
'id': '20126903',
'ext': 'mp4',
'title': 'S1 Ep 1: Recap',
},
'expected_warnings': ['Unknown MIME type application/mp4 in DASH manifest'],
}, {
'url': 'https://www.cinemax.com/warrior/video/s1-ep-1-recap-20126903.embed',
'only_matching': True,
}]
def _real_extract(self, url):
path, video_id = re.match(self._VALID_URL, url).groups()
info = self._extract_info('https://www.cinemax.com/%s.xml' % path, video_id)
info['id'] = video_id
return info

View File

@@ -0,0 +1,151 @@
# coding: utf-8
from __future__ import unicode_literals
import itertools
from .common import InfoExtractor
from ..compat import (
compat_parse_qs,
compat_urllib_parse_urlparse,
)
from ..utils import (
clean_html,
float_or_none,
int_or_none,
try_get,
urlencode_postdata,
)
class CiscoLiveBaseIE(InfoExtractor):
# These appear to be constant across all Cisco Live presentations
# and are not tied to any user session or event
RAINFOCUS_API_URL = 'https://events.rainfocus.com/api/%s'
RAINFOCUS_API_PROFILE_ID = 'Na3vqYdAlJFSxhYTYQGuMbpafMqftalz'
RAINFOCUS_WIDGET_ID = 'n6l4Lo05R8fiy3RpUBm447dZN8uNWoye'
BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/5647924234001/SyK2FdqjM_default/index.html?videoId=%s'
HEADERS = {
'Origin': 'https://ciscolive.cisco.com',
'rfApiProfileId': RAINFOCUS_API_PROFILE_ID,
'rfWidgetId': RAINFOCUS_WIDGET_ID,
}
def _call_api(self, ep, rf_id, query, referrer, note=None):
headers = self.HEADERS.copy()
headers['Referer'] = referrer
return self._download_json(
self.RAINFOCUS_API_URL % ep, rf_id, note=note,
data=urlencode_postdata(query), headers=headers)
def _parse_rf_item(self, rf_item):
event_name = rf_item.get('eventName')
title = rf_item['title']
description = clean_html(rf_item.get('abstract'))
presenter_name = try_get(rf_item, lambda x: x['participants'][0]['fullName'])
bc_id = rf_item['videos'][0]['url']
bc_url = self.BRIGHTCOVE_URL_TEMPLATE % bc_id
duration = float_or_none(try_get(rf_item, lambda x: x['times'][0]['length']))
location = try_get(rf_item, lambda x: x['times'][0]['room'])
if duration:
duration = duration * 60
return {
'_type': 'url_transparent',
'url': bc_url,
'ie_key': 'BrightcoveNew',
'title': title,
'description': description,
'duration': duration,
'creator': presenter_name,
'location': location,
'series': event_name,
}
class CiscoLiveSessionIE(CiscoLiveBaseIE):
_VALID_URL = r'https?://(?:www\.)?ciscolive(?:\.cisco)?\.com/[^#]*#/session/(?P<id>[^/?&]+)'
_TESTS = [{
'url': 'https://ciscolive.cisco.com/on-demand-library/?#/session/1423353499155001FoSs',
'md5': 'c98acf395ed9c9f766941c70f5352e22',
'info_dict': {
'id': '5803694304001',
'ext': 'mp4',
'title': '13 Smart Automations to Monitor Your Cisco IOS Network',
'description': 'md5:ec4a436019e09a918dec17714803f7cc',
'timestamp': 1530305395,
'upload_date': '20180629',
'uploader_id': '5647924234001',
'location': '16B Mezz.',
},
}, {
'url': 'https://www.ciscolive.com/global/on-demand-library.html?search.event=ciscoliveemea2019#/session/15361595531500013WOU',
'only_matching': True,
}, {
'url': 'https://www.ciscolive.com/global/on-demand-library.html?#/session/1490051371645001kNaS',
'only_matching': True,
}]
def _real_extract(self, url):
rf_id = self._match_id(url)
rf_result = self._call_api('session', rf_id, {'id': rf_id}, url)
return self._parse_rf_item(rf_result['items'][0])
class CiscoLiveSearchIE(CiscoLiveBaseIE):
_VALID_URL = r'https?://(?:www\.)?ciscolive(?:\.cisco)?\.com/(?:global/)?on-demand-library(?:\.html|/)'
_TESTS = [{
'url': 'https://ciscolive.cisco.com/on-demand-library/?search.event=ciscoliveus2018&search.technicallevel=scpsSkillLevel_aintroductory&search.focus=scpsSessionFocus_designAndDeployment#/',
'info_dict': {
'title': 'Search query',
},
'playlist_count': 5,
}, {
'url': 'https://ciscolive.cisco.com/on-demand-library/?search.technology=scpsTechnology_applicationDevelopment&search.technology=scpsTechnology_ipv6&search.focus=scpsSessionFocus_troubleshootingTroubleshooting#/',
'only_matching': True,
}, {
'url': 'https://www.ciscolive.com/global/on-demand-library.html?search.technicallevel=scpsSkillLevel_aintroductory&search.event=ciscoliveemea2019&search.technology=scpsTechnology_dataCenter&search.focus=scpsSessionFocus_bestPractices#/',
'only_matching': True,
}]
@classmethod
def suitable(cls, url):
return False if CiscoLiveSessionIE.suitable(url) else super(CiscoLiveSearchIE, cls).suitable(url)
@staticmethod
def _check_bc_id_exists(rf_item):
return int_or_none(try_get(rf_item, lambda x: x['videos'][0]['url'])) is not None
def _entries(self, query, url):
query['size'] = 50
query['from'] = 0
for page_num in itertools.count(1):
results = self._call_api(
'search', None, query, url,
'Downloading search JSON page %d' % page_num)
sl = try_get(results, lambda x: x['sectionList'][0], dict)
if sl:
results = sl
items = results.get('items')
if not items or not isinstance(items, list):
break
for item in items:
if not isinstance(item, dict):
continue
if not self._check_bc_id_exists(item):
continue
yield self._parse_rf_item(item)
size = int_or_none(results.get('size'))
if size is not None:
query['size'] = size
total = int_or_none(results.get('total'))
if total is not None and query['from'] + query['size'] > total:
break
query['from'] += query['size']
def _real_extract(self, url):
query = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
query['type'] = 'session'
return self.playlist_result(
self._entries(query, url), playlist_title='Search query')

Some files were not shown because too many files have changed in this diff Show More