[extractor] Add _search_nuxt_data (#1921)

Authored by: nao20010128nao
This commit is contained in:
The Hatsune Daishi
2021-12-08 01:38:50 +09:00
committed by GitHub
parent 93864403ea
commit 66f4c04e50
2 changed files with 19 additions and 12 deletions

View File

@@ -1513,6 +1513,24 @@ class InfoExtractor(object):
webpage, 'next.js data', **kw),
video_id, **kw)
def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__'):
''' Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function. '''
# not all website do this, but it can be changed
# https://stackoverflow.com/questions/67463109/how-to-change-or-hide-nuxt-and-nuxt-keyword-in-page-source
rectx = re.escape(context_name)
js, arg_keys, arg_vals = self._search_regex(
(r'<script>window\.%s=\(function\((?P<arg_keys>.*?)\)\{return\s(?P<js>\{.*?\})\}\((?P<arg_vals>.+?)\)\);?</script>' % rectx,
r'%s\(.*?\(function\((?P<arg_keys>.*?)\)\{return\s(?P<js>\{.*?\})\}\((?P<arg_vals>.*?)\)' % rectx),
webpage, context_name, group=['js', 'arg_keys', 'arg_vals'])
args = dict(zip(arg_keys.split(','), arg_vals.split(',')))
for key, val in args.items():
if val in ('undefined', 'void 0'):
args[key] = 'null'
return self._parse_json(js_to_json(js, args), video_id)['data'][0]
@staticmethod
def _hidden_inputs(html):
html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)