Fix HTML/MD parser producing empty MessageEntity (#3885)

Closes #3884. The implementation is also simplified.
This commit is contained in:
Rongrong
2022-07-25 17:11:26 +08:00
committed by GitHub
parent 066820900d
commit 046e2cb605
2 changed files with 86 additions and 29 deletions

View File

@@ -64,40 +64,74 @@ def within_surrogate(text, index, *, length=None):
def strip_text(text, entities):
"""
Strips whitespace from the given text modifying the provided entities.
Strips whitespace from the given surrogated text modifying the provided
entities, also removing any empty (0-length) entities.
This assumes that there are no overlapping entities, that their length
is greater or equal to one, and that their length is not out of bounds.
This assumes that the length of entities is greater or equal to 0, and
that no entity is out of bounds.
"""
if not entities:
return text.strip()
while text and text[-1].isspace():
e = entities[-1]
if e.offset + e.length == len(text):
if e.length == 1:
del entities[-1]
if not entities:
return text.strip()
len_ori = len(text)
text = text.lstrip()
left_offset = len_ori - len(text)
text = text.rstrip()
len_final = len(text)
for i in reversed(range(len(entities))):
e = entities[i]
if e.length == 0:
del entities[i]
continue
if e.offset + e.length > left_offset:
if e.offset >= left_offset:
# 0 1|2 3 4 5 | 0 1|2 3 4 5
# ^ ^ | ^
# lo(2) o(5) | o(2)/lo(2)
e.offset -= left_offset
# |0 1 2 3 | |0 1 2 3
# ^ | ^
# o=o-lo(3=5-2) | o=o-lo(0=2-2)
else:
e.length -= 1
text = text[:-1]
# e.offset < left_offset and e.offset + e.length > left_offset
# 0 1 2 3|4 5 6 7 8 9 10
# ^ ^ ^
# o(1) lo(4) o+l(1+9)
e.length = e.offset + e.length - left_offset
e.offset = 0
# |0 1 2 3 4 5 6
# ^ ^
# o(0) o+l=0+o+l-lo(6=0+6=0+1+9-4)
else:
# e.offset + e.length <= left_offset
# 0 1 2 3|4 5
# ^ ^
# o(0) o+l(4)
# lo(4)
del entities[i]
continue
while text and text[0].isspace():
for i in reversed(range(len(entities))):
e = entities[i]
if e.offset != 0:
e.offset -= 1
continue
if e.length == 1:
del entities[0]
if not entities:
return text.lstrip()
else:
e.length -= 1
text = text[1:]
if e.offset + e.length <= len_final:
# |0 1 2 3 4 5 6 7 8 9
# ^ ^
# o(1) o+l(1+9)/lf(10)
continue
if e.offset >= len_final:
# |0 1 2 3 4
# ^
# o(5)/lf(5)
del entities[i]
else:
# e.offset < len_final and e.offset + e.length > len_final
# |0 1 2 3 4 5 (6) (7) (8) (9)
# ^ ^ ^
# o(1) lf(6) o+l(1+8)
e.length = len_final - e.offset
# |0 1 2 3 4 5
# ^ ^
# o(1) o+l=o+lf-o=lf(6=1+5=1+6-1)
return text