diff --git a/telethon/extensions/html.py b/telethon/extensions/html.py
index 9732b615..62e622ba 100644
--- a/telethon/extensions/html.py
+++ b/telethon/extensions/html.py
@@ -174,12 +174,10 @@ def unparse(text: str, entities: Iterable[TypeMessageEntity], _offset: int = 0,
# Otherwise we would end up with malformed text and fail to encode.
# For example of bad input: "Hi \ud83d\ude1c"
# https://en.wikipedia.org/wiki/UTF-16#U+010000_to_U+10FFFF
- while (relative_offset < _length
- and '\ud800' <= text[relative_offset] <= '\udfff'):
+ while helpers.within_surrogate(text, relative_offset, length=_length):
relative_offset += 1
- while (relative_offset + length < _length
- and '\ud800' <= text[relative_offset + length] <= '\udfff'):
+ while helpers.within_surrogate(text, relative_offset + length, length=_length):
length += 1
entity_text = unparse(text=text[relative_offset:relative_offset + length],
@@ -224,7 +222,7 @@ def unparse(text: str, entities: Iterable[TypeMessageEntity], _offset: int = 0,
skip_entity = True
last_offset = relative_offset + (0 if skip_entity else length)
- while last_offset < _length and '\ud800' <= text[last_offset] <= '\udfff':
+ while helpers.within_surrogate(text, last_offset, length=_length):
last_offset += 1
html.append(escape(text[last_offset:]))
diff --git a/telethon/extensions/markdown.py b/telethon/extensions/markdown.py
index 480d633d..f6d59106 100644
--- a/telethon/extensions/markdown.py
+++ b/telethon/extensions/markdown.py
@@ -6,7 +6,7 @@ since they seem to count as two characters and it's a bit strange.
import re
import warnings
-from ..helpers import add_surrogate, del_surrogate, strip_text
+from ..helpers import add_surrogate, del_surrogate, within_surrogate, strip_text
from ..tl import TLObject
from ..tl.types import (
MessageEntityBold, MessageEntityItalic, MessageEntityCode,
@@ -185,11 +185,11 @@ def unparse(text, entities, delimiters=None, url_fmt=None):
while insert_at:
at, what = insert_at.pop()
- # If we are in the middle of a surrogate nudge the position by +1.
+ # If we are in the middle of a surrogate pair, nudge the position forward (+1).
# Otherwise we would end up with malformed text and fail to encode.
# For example of bad input: "Hi \ud83d\ude1c"
# https://en.wikipedia.org/wiki/UTF-16#U+010000_to_U+10FFFF
- while at < len(text) and '\ud800' <= text[at] <= '\udfff':
+ while within_surrogate(text, at):
at += 1
text = text[:at] + what + text[at:]
diff --git a/telethon/helpers.py b/telethon/helpers.py
index 4c8d4799..1b5f8843 100644
--- a/telethon/helpers.py
+++ b/telethon/helpers.py
@@ -40,6 +40,20 @@ def del_surrogate(text):
return text.encode('utf-16', 'surrogatepass').decode('utf-16')
+def within_surrogate(text, index, *, length=None):
+    """
+    `True` if ``index`` is within a surrogate (before and after it, not at!).
+
+    That is, ``text[index - 1]`` is a high (leading) surrogate and
+    ``text[index]`` is the low (trailing) surrogate completing the pair, so
+    cutting or inserting at ``index`` would split the pair in two.
+
+    ``length`` optionally limits how much of ``text`` is considered; it
+    defaults to ``len(text)`` and is clamped to it so that an overly large
+    value can never cause an out-of-range lookup.
+    """
+    if length is None:
+        length = len(text)
+
+    return (
+        # In bounds; ``index - 1`` must exist too, so 0 can never be within.
+        0 < index < min(length, len(text)) and
+        '\ud800' <= text[index - 1] <= '\udbff' and  # previous is high
+        '\udc00' <= text[index] <= '\udfff'  # current is low
+    )
+
+
def strip_text(text, entities):
"""
Strips whitespace from the given text modifying the provided entities.
diff --git a/tests/telethon/extensions/test_html.py b/tests/telethon/extensions/test_html.py
index ee497321..59d96e0d 100644
--- a/tests/telethon/extensions/test_html.py
+++ b/tests/telethon/extensions/test_html.py
@@ -23,7 +23,7 @@ def test_malformed_entities():
text = '๐Telegram Official Android Challenge is over๐.'
entities = [MessageEntityTextUrl(offset=2, length=43, url='https://example.com')]
result = html.unparse(text, entities)
- assert result == '๐Telegram Official Android Challenge is over๐.'
+ assert result == '๐Telegram Official Android Challenge is over๐.'
def test_trailing_malformed_entities():
@@ -35,7 +35,7 @@ def test_trailing_malformed_entities():
text = '๐Telegram Official Android Challenge is over๐'
entities = [MessageEntityTextUrl(offset=2, length=43, url='https://example.com')]
result = html.unparse(text, entities)
- assert result == '๐Telegram Official Android Challenge is over๐'
+ assert result == '๐Telegram Official Android Challenge is over๐'
def test_entities_together():
@@ -51,3 +51,15 @@ def test_entities_together():
text = html.unparse(text, entities)
assert text == original
+
+
+def test_offset_at_emoji():
+ """
+ Tests that an entity starting at an emoji preserves the emoji.
+
+ Checks both directions: parsing ``parsed`` must give back ``text`` and
+ ``entities``, and unparsing those must reproduce ``parsed``.
+ """
+ text = 'Hi\n๐ See example'
+ entities = [MessageEntityBold(0, 2), MessageEntityItalic(3, 2), MessageEntityBold(10, 7)]
+ parsed = 'Hi\n๐ See example'
+
+ assert html.parse(parsed) == (text, entities)
+ assert html.unparse(text, entities) == parsed
diff --git a/tests/telethon/extensions/test_markdown.py b/tests/telethon/extensions/test_markdown.py
index 2f263644..bd78e4d8 100644
--- a/tests/telethon/extensions/test_markdown.py
+++ b/tests/telethon/extensions/test_markdown.py
@@ -23,7 +23,7 @@ def test_malformed_entities():
text = '๐Telegram Official Android Challenge is over๐.'
entities = [MessageEntityTextUrl(offset=2, length=43, url='https://example.com')]
result = markdown.unparse(text, entities)
- assert result == "๐[Telegram Official Android Challenge is over๐](https://example.com)."
+ assert result == "๐[Telegram Official Android Challenge is over](https://example.com)๐."
def test_trailing_malformed_entities():
@@ -35,7 +35,7 @@ def test_trailing_malformed_entities():
text = '๐Telegram Official Android Challenge is over๐'
entities = [MessageEntityTextUrl(offset=2, length=43, url='https://example.com')]
result = markdown.unparse(text, entities)
- assert result == "๐[Telegram Official Android Challenge is over๐](https://example.com)"
+ assert result == "๐[Telegram Official Android Challenge is over](https://example.com)๐"
def test_entities_together():
@@ -51,3 +51,15 @@ def test_entities_together():
text = markdown.unparse(text, entities)
assert text == original
+
+
+def test_offset_at_emoji():
+ """
+ Tests that an entity starting at an emoji preserves the emoji.
+
+ Checks both directions: parsing ``parsed`` must give back ``text`` and
+ ``entities``, and unparsing those must reproduce ``parsed``.
+ """
+ text = 'Hi\n๐ See example'
+ entities = [MessageEntityBold(0, 2), MessageEntityItalic(3, 2), MessageEntityBold(10, 7)]
+ parsed = '**Hi**\n__๐__ See **example**'
+
+ assert markdown.parse(parsed) == (text, entities)
+ assert markdown.unparse(text, entities) == parsed