Fix within surrogate detection

2025-08-08 12:59:46 +00:00 · 2020-02-20 10:53:28 +01:00
parent 3a6c955c90
commit 3d32e16235
5 changed files with 48 additions and 12 deletions
--- a/telethon/extensions/markdown.py
+++ b/telethon/extensions/markdown.py
@@ -6,7 +6,7 @@ since they seem to count as two characters and it's a bit strange.
 import re
 import warnings

-from ..helpers import add_surrogate, del_surrogate, strip_text
+from ..helpers import add_surrogate, del_surrogate, within_surrogate, strip_text
 from ..tl import TLObject
 from ..tl.types import (
    MessageEntityBold, MessageEntityItalic, MessageEntityCode,
@@ -185,11 +185,11 @@ def unparse(text, entities, delimiters=None, url_fmt=None):
    while insert_at:
        at, what = insert_at.pop()

-        # If we are in the middle of a surrogate nudge the position by +1.
+        # If we are in the middle of a surrogate pair, nudge the position forward (+1).
        # Otherwise we would end up with malformed text and fail to encode.
        # For example of bad input: "Hi \ud83d\ude1c"
        # https://en.wikipedia.org/wiki/UTF-16#U+010000_to_U+10FFFF
-        while at < len(text) and '\ud800' <= text[at] <= '\udfff':
+        while within_surrogate(text, at):
            at += 1

        text = text[:at] + what + text[at:]