Enhance emoji detection

2025-07-22 05:49:45 +03:00 · 2017-10-29 16:41:30 +01:00 · 2017-10-29 16:41:30 +01:00 · f5fafc6a27
commit f5fafc6a27
parent 368269cb11
2 changed files with 121 additions and 9 deletions
--- a/telethon/extensions/markdown.py
+++ b/telethon/extensions/markdown.py
@ -22,19 +22,30 @@ class Mode(Enum):
    URL = 5


-EMOJI_PATTERN = re.compile(
-    '['
-    '\U0001F600-\U0001F64F'  # emoticons
-    '\U0001F300-\U0001F5FF'  # symbols & pictographs
-    '\U0001F680-\U0001F6FF'  # transport & map symbols
-    '\U0001F1E0-\U0001F1FF'  # flags (iOS)
-    ']+', flags=re.UNICODE
+# Inclusive code point ranges generated by telethon_generator/emoji_ranges.py
+EMOJI_RANGES = (
+    (8596, 8601), (8617, 8618), (8986, 8987), (9193, 9203), (9208, 9210),
+    (9642, 9643), (9723, 9726), (9728, 9733), (9735, 9746), (9748, 9751),
+    (9754, 9884), (9886, 9905), (9907, 9953), (9956, 9983), (9985, 9988),
+    (9992, 10002), (10035, 10036), (10067, 10069), (10083, 10087),
+    (10133, 10135), (10548, 10549), (11013, 11015), (11035, 11036),
+    (126976, 127166), (127169, 127183), (127185, 127231), (127245, 127247),
+    (127340, 127345), (127358, 127359), (127377, 127386), (127405, 127487),
+    (127489, 127503), (127538, 127546), (127548, 127551), (127561, 128419),
+    (128421, 128591), (128640, 128767), (128884, 128895), (128981, 129023),
+    (129036, 129039), (129096, 129103), (129114, 129119), (129160, 129167),
+    (129198, 129338), (129340, 129342), (129344, 129349), (129351, 129355),
+    (129357, 129471), (129473, 131069)
 )


 def is_emoji(char):
    """Returns True if 'char' looks like an emoji"""
-    return bool(EMOJI_PATTERN.match(char))
+    char = ord(char)  # compare by code point from here on
+    for start, end in EMOJI_RANGES:  # linear scan over (start, end) pairs
+        if start <= char <= end:  # both ends of a range are inclusive
+            return True
+    return False


 def emojiness(char):
@ -44,7 +55,7 @@ def emojiness(char):
    """
    if not is_emoji(char):
        return 1
-    if ord(char) < ord('🤐'):
+    if ord(char) < 129296:
        return 2
    else:
        return 3
--- a/telethon_generator/emoji_ranges.py
+++ b/telethon_generator/emoji_ranges.py
@ -0,0 +1,101 @@
+"""
+Simple module to allow fetching unicode.org emoji lists and printing a
+Python-like tuple out of them.
+
+May not be 100% accurate, and is definitely not as efficient as it could be,
+but it should only be run whenever the Unicode Consortium decides to add
+new emojis to the list.
+"""
+import os
+import sys
+import re
+import urllib.error
+import urllib.request
+
+
+def eprint(*args, **kwargs):  # print() wrapper that writes to stderr
+    print(*args, file=sys.stderr, **kwargs)
+
+
+def get(url, enc='utf-8'):  # download 'url' and return its body as text
+    try:
+        with urllib.request.urlopen(url) as f:
+            return f.read().decode(enc, errors='replace')
+    except urllib.error.HTTPError as e:  # any other error propagates
+        eprint('Caught', e, 'for', url, '; returning empty')
+        return ''  # callers treat an HTTP failure as an empty document
+
+
+PREFIX_URL = 'http://unicode.org/Public/emoji/'
+SUFFIX_URL = '/emoji-data.txt', '/emoji-sequences.txt'
+VERSION_RE = re.compile(r'>(\d+\.\d+)/<')  # 'N.M' directory links only
+OUTPUT_TXT = 'emojies.txt'
+# One code point, optionally followed by a second ('A..B' range or 'A B').
+CODEPOINT_RE = re.compile(r'([\da-fA-F]{3,}(?:[\s.]+[\da-fA-F]{3,})?)')
+EMOJI_START = 0x20e3  # emoji data has many more ranges, falling outside this
+EMOJI_END = 200000  # from some tests those outside the range aren't emojis
+
+# Build the code point cache once; skip the network when it already exists.
+if not os.path.isfile(OUTPUT_TXT):
+    versions = VERSION_RE.findall(get(PREFIX_URL))
+    with open(OUTPUT_TXT, 'w') as f:
+        for version in versions:
+            for s in SUFFIX_URL:
+                url = PREFIX_URL + version + s
+                for line in get(url).split('\n'):
+                    line = line.strip()
+                    if not line or line.startswith('#'):
+                        continue  # blank line or comment
+                    m = CODEPOINT_RE.match(line)  # must start the line
+                    if m:
+                        f.write(m.group(1) + '\n')
+
+
+# Collect every cached code point above 255 into a set.
+points = set()
+with open(OUTPUT_TXT) as cache:
+    for raw in cache:
+        entry = raw.strip()
+        # Each cached line has one of three shapes; parse it into 'found'.
+        if ' ' in entry:
+            # Sequence line: whitespace-separated code points.
+            found = [int(tok, 16) for tok in entry.split()]
+        elif '.' in entry:
+            # Range line 'A..B': both ends are included.
+            lo, hi = entry.split('..')
+            found = range(int(lo, 16), int(hi, 16) + 1)
+        else:
+            # Single code point.
+            found = [int(entry, 16)]
+        # Code points up to 255 are ignored.
+        points.update(cp for cp in found if cp > 255)
+
+
+def _close_run(first, final, out):
+    """Append (first, final) to 'out', or report the run as dropped."""
+    if first == final or not (EMOJI_START < first < EMOJI_END):
+        eprint(
+            'Dropping', final - first + 1,
+            'character(s) from', hex(first), ':', chr(first)
+        )
+    else:
+        out.append((first, final))
+
+
+ranges = []
+points = tuple(sorted(points))
+start = points[0]
+last = start
+for point in points:
+    if point - last > 1:
+        # A gap ends the current run; the next one begins at this point.
+        _close_run(start, last, ranges)
+        start = point
+    last = point
+
+
+# The loop never closes the final run, so do it here.
+_close_run(start, last, ranges)
+
+
+print('EMOJI_RANGES = ({})'.format(', '.join(map(repr, ranges))))