# Emoji helpers for telethon/extensions/markdown.py, as they stand after the
# "Enhance emoji detection" patch.

# Inclusive (first, last) code point pairs, sorted in ascending order.
# Generated using telethon_generator/emoji_ranges.py
EMOJI_RANGES = (
    (8596, 8601), (8617, 8618), (8986, 8987), (9193, 9203), (9208, 9210),
    (9642, 9643), (9723, 9726), (9728, 9733), (9735, 9746), (9748, 9751),
    (9754, 9884), (9886, 9905), (9907, 9953), (9956, 9983), (9985, 9988),
    (9992, 10002), (10035, 10036), (10067, 10069), (10083, 10087),
    (10133, 10135), (10548, 10549), (11013, 11015), (11035, 11036),
    (126976, 127166), (127169, 127183), (127185, 127231), (127245, 127247),
    (127340, 127345), (127358, 127359), (127377, 127386), (127405, 127487),
    (127489, 127503), (127538, 127546), (127548, 127551), (127561, 128419),
    (128421, 128591), (128640, 128767), (128884, 128895), (128981, 129023),
    (129036, 129039), (129096, 129103), (129114, 129119), (129160, 129167),
    (129198, 129338), (129340, 129342), (129344, 129349), (129351, 129355),
    (129357, 129471), (129473, 131069)
)


def is_emoji(char):
    """Returns True if 'char' looks like an emoji.

    'char' is normally a single-character string. An empty string is never
    an emoji, and for longer strings only the first character is examined;
    this mirrors the regex ``match()`` based check this function replaced,
    instead of letting ``ord()`` raise ``TypeError``.
    """
    if not char:
        return False

    code = ord(char[0])
    return any(first <= code <= last for first, last in EMOJI_RANGES)


def emojiness(char):
    """Returns a small integer describing how "emoji-like" 'char' is.

    1 is returned when 'char' is not an emoji according to is_emoji(),
    2 when it is an emoji whose code point is below 129296 (U+1F910),
    and 3 for every other emoji.
    """
    if not is_emoji(char):
        return 1

    # is_emoji() only returns True for non-empty input, and it judges the
    # first character, so the very same character is inspected here.
    if ord(char[0]) < 129296:
        return 2
    else:
        return 3
"""
Simple module to allow fetching unicode.org emoji lists and printing a
Python-like tuple out of them.

May not be 100% accurate, and is definitely not as efficient as it could be,
but it should only be run whenever the Unicode consortium decides to add
new emojis to the list.
"""
import os
import sys
import re
import urllib.error
import urllib.request


PREFIX_URL = 'http://unicode.org/Public/emoji/'
SUFFIX_URL = '/emoji-data.txt', '/emoji-sequences.txt'
# Version directories in the index listing look like ">5.0/<". The dot is
# escaped so that only a literal "major.minor" is accepted.
VERSION_RE = re.compile(r'>(\d+\.\d+)/<')
OUTPUT_TXT = 'emojies.txt'
# One code point, optionally followed by more of them separated by
# whitespace (sequences) or dots (ranges such as "231A..231B"). The trailing
# group is repeatable and optional so that single code point lines and
# sequences longer than two code points are captured whole.
CODEPOINT_RE = re.compile(r'([\da-fA-F]{3,}(?:[\s.]+[\da-fA-F]{3,})*)')
EMOJI_START = 0x20e3  # emoji data has many more ranges, falling outside this
EMOJI_END = 200000  # from some tests those outside the range aren't emojies


def eprint(*args, **kwargs):
    """Same as print(), but writing to the standard error stream."""
    print(*args, file=sys.stderr, **kwargs)


def get(url, enc='utf-8'):
    """Downloads 'url' and returns its body decoded with 'enc'.

    Undecodable bytes are replaced. If the server answers with an HTTP
    error, the error is reported on stderr and '' is returned; any other
    failure propagates to the caller.
    """
    try:
        with urllib.request.urlopen(url) as f:
            return f.read().decode(enc, errors='replace')
    except urllib.error.HTTPError as e:
        eprint('Caught', e, 'for', url, '; returning empty')
        return ''


def extract_codepoints(text):
    """Yields the leading code point field of every data line in 'text'.

    Blank lines, '#' comment lines and lines which don't start with
    hexadecimal code points are skipped.
    """
    for line in text.split('\n'):
        line = line.strip()
        if not line or line.startswith('#'):
            continue

        # match() anchors at the start of the line, which is where the
        # code points are expected to be.
        m = CODEPOINT_RE.match(line)
        if m:
            yield m.group(1)


def download_codepoints(path=OUTPUT_TXT):
    """Fetches every known emoji data file and caches its code points.

    The result is written to 'path', one code point field per line.
    """
    versions = VERSION_RE.findall(get(PREFIX_URL))
    with open(path, 'w') as f:
        for version in versions:
            for suffix in SUFFIX_URL:
                url = PREFIX_URL + version + suffix
                for field in extract_codepoints(get(url)):
                    f.write(field + '\n')


def parse_points(lines):
    """Returns the set of code points above 255 described by 'lines'.

    Each line is either an inclusive range ("231A..231B") or one or more
    whitespace separated hexadecimal code points. Blank lines are ignored.
    """
    points = set()
    for line in lines:
        line = line.strip()
        if not line:
            continue

        if '..' in line:
            first, last = line.split('..')
            codes = range(int(first, 16), int(last, 16) + 1)
        else:
            codes = (int(p, 16) for p in line.split())

        # Latin-1 and below (digits, '#', '*', ...) are never reported.
        points.update(c for c in codes if c > 255)

    return points


def _keep_or_drop(ranges, start, last):
    """Appends (start, last) to 'ranges' unless it's a lone or outlying run."""
    if start == last or not (EMOJI_START < start < EMOJI_END):
        eprint(
            'Dropping', last - start + 1,
            'character(s) from', hex(start), ':', chr(start)
        )
    else:
        ranges.append((start, last))


def build_ranges(points):
    """Collapses 'points' into a sorted list of inclusive (start, end) runs.

    Runs made of a single code point, and runs which don't start strictly
    between EMOJI_START and EMOJI_END, are dropped (and reported on stderr).
    An empty input results in an empty list.
    """
    ranges = []
    ordered = sorted(points)
    if not ordered:
        return ranges

    start = last = ordered[0]
    for point in ordered[1:]:
        if point - last > 1:
            # Gap found, so the current run is over
            _keep_or_drop(ranges, start, last)
            start = point

        last = point

    _keep_or_drop(ranges, start, last)
    return ranges


def main():
    """Prints the EMOJI_RANGES tuple, downloading the data if not cached."""
    # The network is only touched when there is no cached file yet
    if not os.path.isfile(OUTPUT_TXT):
        download_codepoints()

    with open(OUTPUT_TXT) as f:
        points = parse_points(f)

    ranges = build_ranges(points)
    print('EMOJI_RANGES = ({})'.format(', '.join(repr(r) for r in ranges)))


if __name__ == '__main__':
    main()