mirror of
				https://github.com/yt-dlp/yt-dlp.git
				synced 2025-11-04 08:30:46 +00:00 
			
		
		
		
	Use _download_xml in more extractors
				
					
				
			This commit is contained in:
		@@ -1,5 +1,4 @@
 | 
			
		||||
import re
 | 
			
		||||
import xml.etree.ElementTree
 | 
			
		||||
import json
 | 
			
		||||
 | 
			
		||||
from .common import InfoExtractor
 | 
			
		||||
@@ -65,18 +64,18 @@ class AppleTrailersIE(InfoExtractor):
 | 
			
		||||
        uploader_id = mobj.group('company')
 | 
			
		||||
 | 
			
		||||
        playlist_url = compat_urlparse.urljoin(url, u'includes/playlists/itunes.inc')
 | 
			
		||||
        playlist_snippet = self._download_webpage(playlist_url, movie)
 | 
			
		||||
        playlist_cleaned = re.sub(r'(?s)<script[^<]*?>.*?</script>', u'', playlist_snippet)
 | 
			
		||||
        playlist_cleaned = re.sub(r'<img ([^<]*?)>', r'<img \1/>', playlist_cleaned)
 | 
			
		||||
        # The ' in the onClick attributes are not escaped, it couldn't be parsed
 | 
			
		||||
        # with xml.etree.ElementTree.fromstring
 | 
			
		||||
        # like: http://trailers.apple.com/trailers/wb/gravity/
 | 
			
		||||
        def _clean_json(m):
 | 
			
		||||
            return u'iTunes.playURL(%s);' % m.group(1).replace('\'', '&#39;')
 | 
			
		||||
        playlist_cleaned = re.sub(self._JSON_RE, _clean_json, playlist_cleaned)
 | 
			
		||||
        playlist_html = u'<html>' + playlist_cleaned + u'</html>'
 | 
			
		||||
        def fix_html(s):
            """Turn the downloaded playlist snippet into a parseable document.

            Strips <script> elements, self-closes <img> tags, escapes the
            single quotes inside the iTunes.playURL(...) calls and wraps the
            result in a single <html> root element.
            """
            # Drop script elements entirely (the pattern spans newlines).
            s = re.sub(r'(?s)<script[^<]*?>.*?</script>', u'', s)
            # <img ...> is not closed in the snippet; make it self-closing.
            s = re.sub(r'<img ([^<]*?)>', r'<img \1/>', s)

            # The ' in the onClick attributes are not escaped, so the snippet
            # could not be parsed otherwise,
            # like: http://trailers.apple.com/trailers/wb/gravity/
            def _clean_json(m):
                # Replace each raw single quote with its numeric character
                # reference so the attribute value stays well-formed.
                return u'iTunes.playURL(%s);' % m.group(1).replace('\'', '&#39;')
            s = re.sub(self._JSON_RE, _clean_json, s)
            # Wrap everything in one root element.
            s = u'<html>' + s + u'</html>'
            return s
 | 
			
		||||
        doc = self._download_xml(playlist_url, movie, transform_source=fix_html)
 | 
			
		||||
 | 
			
		||||
        doc = xml.etree.ElementTree.fromstring(playlist_html)
 | 
			
		||||
        playlist = []
 | 
			
		||||
        for li in doc.findall('./div/ul/li'):
 | 
			
		||||
            on_click = li.find('.//a').attrib['onClick']
 | 
			
		||||
 
 | 
			
		||||
@@ -1,9 +1,9 @@
 | 
			
		||||
import re
 | 
			
		||||
import xml.etree.ElementTree
 | 
			
		||||
 | 
			
		||||
from .common import InfoExtractor
 | 
			
		||||
from ..utils import (
 | 
			
		||||
    find_xpath_attr,
 | 
			
		||||
    fix_xml_all_ampersand,
 | 
			
		||||
)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@@ -30,12 +30,10 @@ class ClipsyndicateIE(InfoExtractor):
 | 
			
		||||
        # it includes a required token
 | 
			
		||||
        flvars = self._search_regex(r'flvars: "(.*?)"', js_player, u'flvars')
 | 
			
		||||
 | 
			
		||||
        playlist_page = self._download_webpage(
 | 
			
		||||
        pdoc = self._download_xml(
 | 
			
		||||
            'http://eplayer.clipsyndicate.com/osmf/playlist?%s' % flvars,
 | 
			
		||||
            video_id, u'Downloading video info') 
 | 
			
		||||
        # Fix broken xml
 | 
			
		||||
        playlist_page = re.sub('&', '&amp;', playlist_page)
 | 
			
		||||
        pdoc = xml.etree.ElementTree.fromstring(playlist_page.encode('utf-8'))
 | 
			
		||||
            video_id, u'Downloading video info',
 | 
			
		||||
            transform_source=fix_xml_all_ampersand) 
 | 
			
		||||
 | 
			
		||||
        track_doc = pdoc.find('trackList/track')
 | 
			
		||||
        def find_param(name):
 | 
			
		||||
 
 | 
			
		||||
@@ -1,8 +1,10 @@
 | 
			
		||||
import re
 | 
			
		||||
import xml.etree.ElementTree
 | 
			
		||||
import operator
 | 
			
		||||
 | 
			
		||||
from .common import InfoExtractor
 | 
			
		||||
from ..utils import (
 | 
			
		||||
    fix_xml_all_ampersand,
 | 
			
		||||
)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class MetacriticIE(InfoExtractor):
 | 
			
		||||
@@ -23,9 +25,8 @@ class MetacriticIE(InfoExtractor):
 | 
			
		||||
        video_id = mobj.group('id')
 | 
			
		||||
        webpage = self._download_webpage(url, video_id)
 | 
			
		||||
        # The xml is not well formatted, there are raw '&'
 | 
			
		||||
        info_xml = self._download_webpage('http://www.metacritic.com/video_data?video=' + video_id,
 | 
			
		||||
            video_id, u'Downloading info xml').replace('&', '&amp;')
 | 
			
		||||
        info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8'))
 | 
			
		||||
        info = self._download_xml('http://www.metacritic.com/video_data?video=' + video_id,
 | 
			
		||||
            video_id, u'Downloading info xml', transform_source=fix_xml_all_ampersand)
 | 
			
		||||
 | 
			
		||||
        clip = next(c for c in info.findall('playList/clip') if c.find('id').text == video_id)
 | 
			
		||||
        formats = []
 | 
			
		||||
 
 | 
			
		||||
@@ -1057,3 +1057,8 @@ def month_by_name(name):
 | 
			
		||||
        return ENGLISH_NAMES.index(name) + 1
 | 
			
		||||
    except ValueError:
 | 
			
		||||
        return None
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def fix_xml_all_ampersand(xml_str):
    """Replace all the '&' by '&amp;' in XML.

    Every ampersand in xml_str is escaped, including any that already
    start an entity reference (so '&amp;' becomes '&amp;amp;').
    Returns the escaped string.
    """
    return xml_str.replace(u'&', u'&amp;')
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user