Add support for unparsing nested entities into HTML (#1209)

This commit is contained in:
Tulir Asokan 2019-06-24 13:28:14 +03:00 committed by Lonami
parent 962949008f
commit 8b28f4ffbf

View File

@ -5,13 +5,15 @@ import struct
from collections import deque from collections import deque
from html import escape, unescape from html import escape, unescape
from html.parser import HTMLParser from html.parser import HTMLParser
from typing import Iterable, Optional, Tuple, List
from .. import helpers from .. import helpers
from ..tl.types import ( from ..tl.types import (
MessageEntityBold, MessageEntityItalic, MessageEntityCode, MessageEntityBold, MessageEntityItalic, MessageEntityCode,
MessageEntityPre, MessageEntityEmail, MessageEntityUrl, MessageEntityPre, MessageEntityEmail, MessageEntityUrl,
MessageEntityTextUrl, MessageEntityMentionName, MessageEntityTextUrl, MessageEntityMentionName,
MessageEntityUnderline, MessageEntityStrike, MessageEntityBlockquote MessageEntityUnderline, MessageEntityStrike, MessageEntityBlockquote,
TypeMessageEntity
) )
@ -121,7 +123,7 @@ class HTMLToTelegramParser(HTMLParser):
self.entities.append(entity) self.entities.append(entity)
def parse(html): def parse(html: str) -> Tuple[str, List[TypeMessageEntity]]:
""" """
Parses the given HTML message and returns its stripped representation Parses the given HTML message and returns its stripped representation
plus a list of the MessageEntity's that were found. plus a list of the MessageEntity's that were found.
@ -138,7 +140,8 @@ def parse(html):
return _del_surrogate(text), parser.entities return _del_surrogate(text), parser.entities
def unparse(text, entities): def unparse(text: str, entities: Iterable[TypeMessageEntity], _offset: int = 0,
_length: Optional[int] = None) -> str:
""" """
Performs the reverse operation to .parse(), effectively returning HTML Performs the reverse operation to .parse(), effectively returning HTML
given a normal text and its MessageEntity's. given a normal text and its MessageEntity's.
@ -147,20 +150,29 @@ def unparse(text, entities):
:param entities: the MessageEntity's applied to the text. :param entities: the MessageEntity's applied to the text.
:return: a HTML representation of the combination of both inputs. :return: a HTML representation of the combination of both inputs.
""" """
if not text or not entities: if not text:
return text return text
elif not entities:
return escape(text)
text = _add_surrogate(text) text = _add_surrogate(text)
if _length is None:
_length = len(text)
html = [] html = []
last_offset = 0 last_offset = 0
for entity in entities: for i, entity in enumerate(entities):
if entity.offset > last_offset: if entity.offset > _offset + _length:
html.append(escape(text[last_offset:entity.offset])) break
elif entity.offset < last_offset: relative_offset = entity.offset - _offset
if relative_offset > last_offset:
html.append(escape(text[last_offset:relative_offset]))
elif relative_offset < last_offset:
continue continue
skip_entity = False skip_entity = False
entity_text = escape(text[entity.offset:entity.offset + entity.length]) entity_text = unparse(text=text[relative_offset:relative_offset + entity.length],
entities=entities[i + 1:],
_offset=entity.offset, _length=entity.length)
entity_type = type(entity) entity_type = type(entity)
if entity_type == MessageEntityBold: if entity_type == MessageEntityBold:
@ -198,6 +210,6 @@ def unparse(text, entities):
.format(entity.user_id, entity_text)) .format(entity.user_id, entity_text))
else: else:
skip_entity = True skip_entity = True
last_offset = entity.offset + (0 if skip_entity else entity.length) last_offset = relative_offset + (0 if skip_entity else entity.length)
html.append(text[last_offset:]) html.append(escape(text[last_offset:]))
return _del_surrogate(''.join(html)) return _del_surrogate(''.join(html))