[extractor] Use classmethod/property where possible

and refactor lazy extractors accordingly.

This reduces the need to create extractor instances.
This commit is contained in:
pukkandan
2022-05-11 21:24:44 +05:30
parent 7ddbf09c25
commit 82d020804d
11 changed files with 188 additions and 167 deletions

View File

@@ -37,11 +37,17 @@ def gen_extractors():
return [klass() for klass in gen_extractor_classes()]
def list_extractor_classes(age_limit=None):
    """Yield the extractor classes that are suitable for the given age.

    Classes are yielded sorted case-insensitively by ``IE_NAME``. ``GenericIE``
    is excluded from the sorted run and is always yielded last, without being
    checked against ``age_limit``.
    """
    suitable = (
        ie for ie in gen_extractor_classes()
        if ie.is_suitable(age_limit) and ie != GenericIE)  # noqa: F405
    yield from sorted(suitable, key=lambda ie: ie.IE_NAME.lower())
    yield GenericIE  # noqa: F405


def list_extractors(age_limit=None):
    """Return a list of extractor instances that are suitable for the given age, sorted by extractor name"""
    return [ie() for ie in list_extractor_classes(age_limit)]
def get_info_extractor(ie_name):

View File

@@ -40,6 +40,7 @@ from ..utils import (
age_restricted,
base_url,
bug_reports_message,
classproperty,
clean_html,
determine_ext,
determine_protocol,
@@ -710,9 +711,9 @@ class InfoExtractor:
"""A string for getting the InfoExtractor with get_info_extractor"""
return cls.__name__[:-2]
@classproperty
def IE_NAME(cls):
    """The extractor's name: the class name with its trailing two characters (the ``IE`` suffix) removed"""
    return cls.__name__[:-2]
@staticmethod
def __can_accept_status_code(err, expected_status):
@@ -3624,56 +3625,57 @@ class InfoExtractor:
self._set_cookie(domain, cookie, value)
break
@classmethod
def get_testcases(cls, include_onlymatching=False):
    """Yield the test-case dicts declared on the extractor class.

    A truthy ``_TEST`` attribute is used as the only test case; otherwise the
    ``_TESTS`` list is used (empty when absent). Every yielded dict has its
    ``name`` key set to ``cls.ie_key()``.

    NOTE: the dicts are the class-level objects themselves, so setting
    ``name`` modifies them in place.

    @param include_onlymatching  also yield test cases flagged ``only_matching``
    """
    t = getattr(cls, '_TEST', None)
    if t:
        # Declaring both forms is an authoring mistake in the extractor
        assert not hasattr(cls, '_TESTS'), f'{cls.ie_key()}IE has _TEST and _TESTS'
        tests = [t]
    else:
        tests = getattr(cls, '_TESTS', [])
    for t in tests:
        if not include_onlymatching and t.get('only_matching', False):
            continue
        t['name'] = cls.ie_key()
        yield t
@classmethod
def is_suitable(cls, age_limit):
    """ Test whether the extractor is generally suitable for the given
    age limit (i.e. pornographic sites are not, all others usually are) """

    any_restricted = False
    for tc in cls.get_testcases(include_onlymatching=False):
        if tc.get('playlist', []):
            # For playlist tests, judge by the first entry
            tc = tc['playlist'][0]
        is_restricted = age_restricted(tc.get('info_dict', {}).get('age_limit'), age_limit)
        if not is_restricted:
            # One unrestricted test case is enough to call the extractor suitable
            return True
        any_restricted = any_restricted or is_restricted
    # No test cases at all => suitable; only restricted ones => not suitable
    return not any_restricted
@classmethod
def description(cls, *, markdown=True, search_examples=None):
    """Description of the extractor

    Builds one line made of the extractor name followed by optional parts:
    the netrc machine, ``[HIDDEN]`` or ``IE_DESC``, the search-key prefix
    (with a randomly picked example when ``search_examples`` is given) and a
    "currently broken" marker when ``cls.working()`` is false.

    @param markdown         format the line as a markdown list item
    @param search_examples  sequence of example queries to pick from
    @returns                the name alone when there is nothing to append
    """
    desc = ''
    if cls._NETRC_MACHINE:
        if markdown:
            desc += f' [<abbr title="netrc machine"><em>{cls._NETRC_MACHINE}</em></abbr>]'
        else:
            desc += f' [{cls._NETRC_MACHINE}]'
    if cls.IE_DESC is False:
        desc += ' [HIDDEN]'
    elif cls.IE_DESC:
        desc += f' {cls.IE_DESC}'
    if cls.SEARCH_KEY:
        desc += f'; "{cls.SEARCH_KEY}:" prefix'
        if search_examples:
            _COUNTS = ('', '5', '10', 'all')
            desc += f' (Example: "{cls.SEARCH_KEY}{random.choice(_COUNTS)}:{random.choice(search_examples)}")'
    if not cls.working():
        desc += ' (**Currently broken**)' if markdown else ' (Currently broken)'

    name = f' - **{cls.IE_NAME}**' if markdown else cls.IE_NAME
    return f'{name}:{desc}' if desc else name
def extract_subtitles(self, *args, **kwargs):
@@ -3849,6 +3851,6 @@ class SearchInfoExtractor(InfoExtractor):
"""Returns an iterator of search results"""
raise NotImplementedError('This method must be implemented by subclasses')
@classproperty
def SEARCH_KEY(cls):
    """The search key of the extractor, read from the class's ``_SEARCH_KEY``"""
    return cls._SEARCH_KEY

View File

@@ -18,6 +18,7 @@ from ..utils import (
url_or_none,
)
class DRTVIE(InfoExtractor):
_VALID_URL = r'''(?x)
https?://

View File

@@ -8,55 +8,36 @@ class TestURLIE(InfoExtractor):
""" Allows addressing of the test cases as test:yout.*be_1 """
IE_DESC = False # Do not list
_VALID_URL = r'test(?:url)?:(?P<id>(?P<extractor>.+?)(?:_(?P<num>[0-9]+))?)$'
_VALID_URL = r'test(?:url)?:(?P<extractor>.+?)(?:_(?P<num>[0-9]+))?$'
def _real_extract(self, url):
    """Resolve a ``test:<extractor>[_<num>]`` URL to the URL of that extractor's test case.

    ``<extractor>`` is treated as a case-insensitive regular expression that
    is searched for in each extractor class's ``IE_NAME``. When several
    classes match, one whose ``IE_NAME`` equals the pattern exactly
    (ignoring case) is preferred. ``<num>`` indexes the extractor's test
    cases (``only_matching`` ones included) and defaults to 0.

    Raises an expected ExtractorError when no extractor matches, when the
    match is ambiguous, or when the test-case index is out of range.
    """
    # Imported here rather than at module level — presumably to avoid a circular import
    from ..extractor import gen_extractor_classes

    extractor_id, num = self._match_valid_url(url).group('extractor', 'num')

    rex = re.compile(extractor_id, flags=re.IGNORECASE)
    matching_extractors = [e for e in gen_extractor_classes() if rex.search(e.IE_NAME)]

    if not matching_extractors:
        # The f-prefix is required for the extractor name to be interpolated
        raise ExtractorError(f'No extractors matching {extractor_id!r} found', expected=True)
    elif len(matching_extractors) > 1:
        try:  # Check for exact match
            extractor = next(
                ie for ie in matching_extractors
                if ie.IE_NAME.lower() == extractor_id.lower())
        except StopIteration:
            raise ExtractorError(
                'Found multiple matching extractors: %s' % ' '.join(ie.IE_NAME for ie in matching_extractors),
                expected=True)
    else:
        extractor = matching_extractors[0]

    testcases = tuple(extractor.get_testcases(True))
    try:
        tc = testcases[int(num or 0)]
    except IndexError:
        raise ExtractorError(
            f'Test case {num or 0} not found, got only {len(testcases)} tests', expected=True)

    self.to_screen(f'Test URL: {tc["url"]}')
    return self.url_result(tc['url'])

View File

@@ -31,6 +31,7 @@ from ..utils import (
NO_DEFAULT,
ExtractorError,
bug_reports_message,
classproperty,
clean_html,
datetime_from_str,
dict_get,
@@ -5781,16 +5782,17 @@ class YoutubeMusicSearchURLIE(YoutubeTabBaseInfoExtractor):
class YoutubeFeedsInfoExtractor(InfoExtractor):
"""
Base class for feed extractors
Subclasses must define the _FEED_NAME property.
Subclasses must re-define the _FEED_NAME property.
"""
_LOGIN_REQUIRED = True
_FEED_NAME = 'feeds'
def _real_initialize(self):
    """Run YoutubeBaseInfoExtractor's login-required check on this extractor"""
    YoutubeBaseInfoExtractor._check_login_required(self)
@classproperty
def IE_NAME(cls):
    """The extractor's name, built as ``youtube:<_FEED_NAME>`` from the class attribute"""
    return f'youtube:{cls._FEED_NAME}'
def _real_extract(self, url):
return self.url_result(