mirror of
https://github.com/LonamiWebs/Telethon.git
synced 2025-02-09 08:00:53 +03:00
Enhance emoji detection
This commit is contained in:
parent
368269cb11
commit
f5fafc6a27
|
@ -22,19 +22,30 @@ class Mode(Enum):
|
||||||
URL = 5
|
URL = 5
|
||||||
|
|
||||||
|
|
||||||
EMOJI_PATTERN = re.compile(
|
# using telethon_generator/emoji_ranges.py
|
||||||
'['
|
EMOJI_RANGES = (
|
||||||
'\U0001F600-\U0001F64F' # emoticons
|
(8596, 8601), (8617, 8618), (8986, 8987), (9193, 9203), (9208, 9210),
|
||||||
'\U0001F300-\U0001F5FF' # symbols & pictographs
|
(9642, 9643), (9723, 9726), (9728, 9733), (9735, 9746), (9748, 9751),
|
||||||
'\U0001F680-\U0001F6FF' # transport & map symbols
|
(9754, 9884), (9886, 9905), (9907, 9953), (9956, 9983), (9985, 9988),
|
||||||
'\U0001F1E0-\U0001F1FF' # flags (iOS)
|
(9992, 10002), (10035, 10036), (10067, 10069), (10083, 10087),
|
||||||
']+', flags=re.UNICODE
|
(10133, 10135), (10548, 10549), (11013, 11015), (11035, 11036),
|
||||||
|
(126976, 127166), (127169, 127183), (127185, 127231), (127245, 127247),
|
||||||
|
(127340, 127345), (127358, 127359), (127377, 127386), (127405, 127487),
|
||||||
|
(127489, 127503), (127538, 127546), (127548, 127551), (127561, 128419),
|
||||||
|
(128421, 128591), (128640, 128767), (128884, 128895), (128981, 129023),
|
||||||
|
(129036, 129039), (129096, 129103), (129114, 129119), (129160, 129167),
|
||||||
|
(129198, 129338), (129340, 129342), (129344, 129349), (129351, 129355),
|
||||||
|
(129357, 129471), (129473, 131069)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def is_emoji(char):
|
def is_emoji(char):
|
||||||
"""Returns True if 'char' looks like an emoji"""
|
"""Returns True if 'char' looks like an emoji"""
|
||||||
return bool(EMOJI_PATTERN.match(char))
|
char = ord(char)
|
||||||
|
for start, end in EMOJI_RANGES:
|
||||||
|
if start <= char <= end:
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
def emojiness(char):
|
def emojiness(char):
|
||||||
|
@ -44,7 +55,7 @@ def emojiness(char):
|
||||||
"""
|
"""
|
||||||
if not is_emoji(char):
|
if not is_emoji(char):
|
||||||
return 1
|
return 1
|
||||||
if ord(char) < ord('🤐'):
|
if ord(char) < 129296:
|
||||||
return 2
|
return 2
|
||||||
else:
|
else:
|
||||||
return 3
|
return 3
|
||||||
|
|
101
telethon_generator/emoji_ranges.py
Normal file
101
telethon_generator/emoji_ranges.py
Normal file
|
@ -0,0 +1,101 @@
|
||||||
|
"""
|
||||||
|
Simple module to allow fetching unicode.org emoji lists and printing a
|
||||||
|
Python-like tuple out of them.
|
||||||
|
|
||||||
|
May not be 100% accurate, and is definitely not as efficient as it could be,
|
||||||
|
but it should only be run whenever the Unicode consortium decides to add
|
||||||
|
new emojis to the list.
|
||||||
|
"""
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import re
|
||||||
|
import urllib.error
|
||||||
|
import urllib.request
|
||||||
|
|
||||||
|
|
||||||
|
def eprint(*args, **kwargs):
    """Forward all arguments to ``print``, writing to ``sys.stderr``."""
    stream = sys.stderr
    print(*args, file=stream, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
def get(url, enc='utf-8'):
    """
    Fetch ``url`` and return its body decoded as text.

    :param url: the URL to download.
    :param enc: encoding used to decode the response body; bytes that are
                invalid in this encoding are replaced rather than rejected.
    :return: the decoded body, or an empty string if the request failed
             (the failure is reported on standard error).
    """
    try:
        with urllib.request.urlopen(url) as f:
            return f.read().decode(enc, errors='replace')
    except urllib.error.URLError as e:
        # URLError is the base class of HTTPError, so bad HTTP statuses are
        # still handled here. Catching the base class also covers failures
        # to reach the server at all (DNS errors, refused connections),
        # which previously aborted the whole script.
        eprint('Caught', e, 'for', url, '; returning empty')
        return ''
|
||||||
|
|
||||||
|
|
||||||
|
# Base URL; each emoji version is appended to it to build the data file URLs.
PREFIX_URL = 'http://unicode.org/Public/emoji/'
# Paths of the data files requested for every version found.
SUFFIX_URL = '/emoji-data.txt', '/emoji-sequences.txt'
# Captures "X.Y" out of ">X.Y/<". The dot is escaped so that only a literal
# '.' may separate the two numbers (a bare '.' would also accept ">1234/<").
VERSION_RE = re.compile(r'>(\d+\.\d+)/<')
# File where the extracted code point lines are stored.
OUTPUT_TXT = 'emojies.txt'
# Two hexadecimal numbers (3+ digits each) separated by whitespace or dots.
# NOTE(review): the second number is mandatory, so a line holding a single
# code point never matches -- confirm whether that is intended.
CODEPOINT_RE = re.compile(r'([\da-fA-F]{3,}(?:[\s.]+[\da-fA-F]{3,}))')
EMOJI_START = 0x20e3  # emoji data has many more ranges, falling outside this
EMOJI_END = 200000  # from some tests those outside the range aren't emojis
|
||||||
|
|
||||||
|
|
||||||
|
# Versions advertised at PREFIX_URL. NOTE: this request is made on every
# run, even when the cache file below already exists.
versions = VERSION_RE.findall(get(PREFIX_URL))
# Code point entries extracted from the downloaded data files.
lines = []
if not os.path.isfile(OUTPUT_TXT):
    for version in versions:
        for s in SUFFIX_URL:
            url = PREFIX_URL + version + s
            for line in get(url).split('\n'):
                line = line.strip()
                if not line or line.startswith('#'):
                    continue
                # Only lines that *begin* with a code point entry are kept.
                m = CODEPOINT_RE.match(line)
                if m:
                    lines.append(m.group(1))

    if not lines:
        # `get` returns '' on failure, so a network problem ends up here.
        # Creating the file anyway would leave an empty cache behind that
        # prevents every later run from retrying the download.
        sys.exit('No emoji data could be retrieved; not creating '
                 + OUTPUT_TXT)

    # Written only once everything has been collected, one entry per line.
    with open(OUTPUT_TXT, 'w') as f:
        f.writelines(entry + '\n' for entry in lines)
|
||||||
|
|
||||||
|
|
||||||
|
# Every code point (as an integer) mentioned in the stored data, above 255.
points = set()
with open(OUTPUT_TXT) as f:
    for line in f:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a hand-edited file) instead of
            # failing inside int('', 16).
            continue

        if ' ' in line:
            # Several code points separated by whitespace.
            found = [int(p, 16) for p in line.split()]
        elif '.' in line:
            # Inclusive range written as START..END.
            s, e = line.split('..')
            found = range(int(s, 16), int(e, 16) + 1)
        else:
            # A single code point.
            found = (int(line, 16),)

        # Values up to 255 are never recorded.
        # NOTE(review): presumably this keeps plain ASCII/Latin-1 characters
        # that appear inside sequences out of the result -- confirm.
        points.update(i for i in found if i > 255)
|
||||||
|
|
||||||
|
|
||||||
|
def _add_range(ranges, start, last):
    """
    Append the inclusive range ``(start, last)`` to ``ranges`` if it is kept.

    A range made of a single code point, or one whose start lies outside the
    open interval ``(EMOJI_START, EMOJI_END)``, is reported on standard error
    and dropped instead.
    """
    if start == last or not (EMOJI_START < start < EMOJI_END):
        eprint(
            'Dropping', last - start + 1,
            'character(s) from', hex(start), ':', chr(start)
        )
    else:
        ranges.append((start, last))


# Collapse the sorted code points into runs of consecutive values.
ranges = []
points = tuple(sorted(points))
if points:
    start = last = points[0]
    for point in points[1:]:
        if point - last > 1:
            # Gap found: the run start..last is complete.
            _add_range(ranges, start, last)
            start = point

        last = point

    # The run that was still open when the input ended.
    _add_range(ranges, start, last)
else:
    # Previously this case failed with IndexError on points[0].
    eprint('No code points were loaded; the resulting tuple is empty')

# repr() of a tuple gives the same text as joining the items by hand, but
# stays a valid tuple of tuples for a single range ("((a, b),)") and for no
# ranges at all ("()").
print('EMOJI_RANGES = {!r}'.format(tuple(ranges)))
|
Loading…
Reference in New Issue
Block a user