Telethon/client/src/telethon/_impl/client/parsers/markdown.py
2024-03-17 13:06:03 +01:00

196 lines
7.0 KiB
Python

import re
from collections.abc import Iterator
from typing import Any, Type
import markdown_it
import markdown_it.token
from ...tl.abcs import MessageEntity
from ...tl.types import (
MessageEntityBlockquote,
MessageEntityBold,
MessageEntityCode,
MessageEntityItalic,
MessageEntityMentionName,
MessageEntityPre,
MessageEntityStrike,
MessageEntityTextUrl,
MessageEntityUnderline,
)
from .strings import add_surrogate, del_surrogate, within_surrogate
# Shared parser instance used by parse(), with the "strikethrough" rule enabled.
MARKDOWN = markdown_it.MarkdownIt()
MARKDOWN.enable("strikethrough")

# Markup that unparse() writes around an entity, as an (opening, closing) pair.
# An empty closing string means nothing is written at the end of the entity.
DELIMITERS: dict[Type[MessageEntity], tuple[str, str]] = {
    MessageEntityBlockquote: ("> ", ""),
    MessageEntityBold: ("**", "**"),
    MessageEntityCode: ("`", "`"),
    MessageEntityItalic: ("_", "_"),
    MessageEntityStrike: ("~~", "~~"),
    MessageEntityUnderline: ("# ", ""),
}
# Not trying to be complete; just enough to have an alternative (mostly for inline underline).
# The fact headings are treated as underline is an implementation detail.

# Matches the start of an HTML tag. Group 1 is "/" for a closing tag (empty
# otherwise) and group 2 is the tag name.
TAG_PATTERN = re.compile(r"<\s*(/?)\s*(\w+)")

# Maps a lowercase HTML tag name to the markdown token types it stands for,
# as an (opening type, closing type) pair. The pair is indexed with
# bool(is_closing_tag), so the opening type MUST come first.
HTML_TO_TYPE = {
    "i": ("em_open", "em_close"),
    "em": ("em_open", "em_close"),
    "b": ("strong_open", "strong_close"),
    "strong": ("strong_open", "strong_close"),
    "s": ("s_open", "s_close"),
    "del": ("s_open", "s_close"),
    "u": ("heading_open", "heading_close"),
    "mark": ("heading_open", "heading_close"),
}
def expand_inline_and_html(
    tokens: list[markdown_it.token.Token],
) -> Iterator[markdown_it.token.Token]:
    """
    Flattens a token list into a single stream of tokens.

    "inline" tokens are never yielded themselves; their children (if any)
    are expanded recursively in their place.

    "html_inline" tokens whose tag name is listed in HTML_TO_TYPE are
    rewritten in place (both their type and their nesting are overwritten)
    and then yielded. Any other "html_inline" token is dropped.

    Every remaining token is yielded unchanged.
    """
    for tok in tokens:
        kind = tok.type

        if kind == "inline":
            # An empty or missing child list simply contributes nothing.
            yield from expand_inline_and_html(tok.children or [])
            continue

        if kind != "html_inline":
            yield tok
            continue

        found = TAG_PATTERN.match(tok.content)
        if found is None:
            continue

        slash, name = found.groups()
        mapped = HTML_TO_TYPE.get(name.lower())
        if not mapped:
            continue

        # `slash` is "/" for a closing tag and "" for an opening one.
        tok.type = mapped[bool(slash)]
        tok.nesting = -1 if slash else 1
        yield tok
def parse(message: str) -> tuple[str, list[MessageEntity]]:
    """
    Parses the given markdown message and returns its stripped representation
    plus a list of the MessageEntity's that were found.

    The input is stripped of surrounding whitespace before being parsed. An
    empty (or otherwise falsy) message is returned as-is with no entities.

    Entity offsets and lengths are measured on the text produced from
    add_surrogate(message), before del_surrogate() is applied to the result.

    An entity whose opening token never gets a matching closing token is
    kept in the result with a length of zero.
    """
    if not message:
        return message, []

    entities: list[MessageEntity] = []
    # Entities that have been opened but not closed yet, oldest first.
    # Tracking them separately (rather than scanning `entities`) makes sure a
    # closing token can only ever finish an entity that is still open, which
    # is what keeps nested entities of the same type (such as a quote inside
    # a quote) and stray closing tags from corrupting earlier entities.
    unclosed: list[MessageEntity] = []
    token: markdown_it.token.Token

    def push(ty: Any, **extra: object) -> None:
        # Opens or closes an entity of type `ty` at the current end of
        # `message`, depending on the nesting of the token being processed.
        # `extra` is only used when opening.
        if token.nesting > 0:
            entity = ty(offset=len(message), length=0, **extra)
            entities.append(entity)
            unclosed.append(entity)
            return

        # Close the innermost entity of this type that is still open.
        for index in reversed(range(len(unclosed))):
            if isinstance(unclosed[index], ty):
                entity = unclosed.pop(index)
                setattr(
                    entity, "length", len(message) - getattr(entity, "offset", 0)
                )
                break

    parsed = MARKDOWN.parse(add_surrogate(message.strip()))
    message = ""
    last_map = [0, 0]
    for token in expand_inline_and_html(parsed):
        if token.map is not None and token.map != last_map:
            # Paragraphs, quotes and fences have a line mapping. Use it to determine how many
            # newlines to insert. But don't insert any (leading) newlines if we're yet to reach
            # the first textual content, or if the mappings are the same (e.g. a quote then
            # opens a paragraph but the mapping is equal).
            if message:
                message += "\n" + "\n" * (token.map[0] - last_map[-1])
            last_map = token.map

        if token.type in ("blockquote_close", "blockquote_open"):
            push(MessageEntityBlockquote)
        elif token.type == "code_block":
            entities.append(
                MessageEntityPre(
                    offset=len(message), length=len(token.content), language=""
                )
            )
            message += token.content
        elif token.type == "code_inline":
            entities.append(
                MessageEntityCode(offset=len(message), length=len(token.content))
            )
            message += token.content
        elif token.type in ("em_close", "em_open"):
            push(MessageEntityItalic)
        elif token.type == "fence":
            # Remove a single trailing newline, and do so *before* measuring:
            # the entity must span exactly the text that is appended, or it
            # would reach one unit past the code (and past the end of the
            # message when the fence comes last).
            code = token.content.removesuffix("\n")
            entities.append(
                MessageEntityPre(
                    offset=len(message), length=len(code), language=token.info
                )
            )
            message += code
        elif token.type == "hardbreak":
            message += "\n"
        elif token.type in ("heading_close", "heading_open"):
            push(MessageEntityUnderline)
        elif token.type == "hr":
            message += "\u2015\n\n"
        elif token.type in ("link_close", "link_open"):
            if (
                token.markup != "autolink"
            ):  # telegram already picks up on these automatically
                push(MessageEntityTextUrl, url=token.attrs.get("href"))
        elif token.type in ("s_close", "s_open"):
            push(MessageEntityStrike)
        elif token.type == "softbreak":
            message += "\n"
        elif token.type in ("strong_close", "strong_open"):
            push(MessageEntityBold)
        elif token.type == "text":
            message += token.content

    return del_surrogate(message), entities
def unparse(text: str, entities: list[MessageEntity]) -> str:
    """
    Performs the reverse operation to .parse(), effectively returning
    markdown-like syntax given a normal text and its MessageEntity's.

    Because there are many possible ways for markdown to produce a certain
    output, this function cannot invert .parse() perfectly.

    If either the text or the entity list is empty, the text is returned
    unchanged. Entities with no known markdown representation are skipped.

    When several entities start or end at the same position, their markup is
    ordered so that it stays properly nested (for instance `**_text_**` and
    not `**_text**_`). Entities covering the exact same span are nested in
    the order they appear in `entities`, the first one being the outermost.
    """
    if not text or not entities:
        return text

    text = add_surrogate(text)

    # Pairs of (sort key, markup to insert). The first item of the sort key is
    # the insertion position; the rest decides the order of markup that is
    # inserted at the same position:
    #   * closing markup (0) goes before opening markup (1),
    #   * among closings, the entity that started last (innermost) goes first,
    #   * among openings, the entity that ends last (outermost) goes first,
    #   * remaining ties fall back to the order of `entities`.
    insert_at: list[tuple[tuple[int, int, int, int], str]] = []
    for index, e in enumerate(entities):
        offset, length = getattr(e, "offset", None), getattr(e, "length", None)
        assert isinstance(offset, int) and isinstance(length, int)

        delimiter = DELIMITERS.get(type(e), None)
        if delimiter:
            opening, closing = delimiter
        elif isinstance(e, MessageEntityPre):
            opening, closing = f"```{e.language}\n", "```\n"
        elif isinstance(e, MessageEntityTextUrl):
            opening, closing = "[", f"]({e.url})"
        elif isinstance(e, MessageEntityMentionName):
            opening, closing = "[", f"](tg://user?id={e.user_id})"
        else:
            continue

        h = offset
        t = offset + length
        if length == 0:
            # Both ends share a position. Insert them as one piece so the
            # "closings first" rule cannot put the closing before the opening.
            insert_at.append(((h, 1, -t, index), opening + closing))
        else:
            insert_at.append(((h, 1, -t, index), opening))
            insert_at.append(((t, 0, -h, -index), closing))

    insert_at.sort(key=lambda item: item[0])

    # Insert from the end of the text towards the start so that positions
    # which have not been processed yet are not shifted by earlier insertions.
    while insert_at:
        key, what = insert_at.pop()
        at = key[0]

        # If we are in the middle of a surrogate pair, nudge the position
        # forward (+1) until we are past it.
        # Otherwise we would end up with malformed text and fail to encode.
        # For example of bad input: "Hi \ud83d\ude1c"
        # https://en.wikipedia.org/wiki/UTF-16#U+010000_to_U+10FFFF
        while within_surrogate(text, at):
            at += 1

        text = text[:at] + what + text[at:]

    return del_surrogate(text)