Enhance emoji detection

This commit is contained in:
Lonami Exo 2017-10-29 16:41:30 +01:00
parent 368269cb11
commit f5fafc6a27
2 changed files with 121 additions and 9 deletions

View File

@ -22,19 +22,30 @@ class Mode(Enum):
URL = 5
# using telethon_generator/emoji_ranges.py
# Inclusive (first, last) code point pairs, listed in ascending order.
EMOJI_RANGES = (
    (8596, 8601), (8617, 8618), (8986, 8987), (9193, 9203), (9208, 9210),
    (9642, 9643), (9723, 9726), (9728, 9733), (9735, 9746), (9748, 9751),
    (9754, 9884), (9886, 9905), (9907, 9953), (9956, 9983), (9985, 9988),
    (9992, 10002), (10035, 10036), (10067, 10069), (10083, 10087),
    (10133, 10135), (10548, 10549), (11013, 11015), (11035, 11036),
    (126976, 127166), (127169, 127183), (127185, 127231), (127245, 127247),
    (127340, 127345), (127358, 127359), (127377, 127386), (127405, 127487),
    (127489, 127503), (127538, 127546), (127548, 127551), (127561, 128419),
    (128421, 128591), (128640, 128767), (128884, 128895), (128981, 129023),
    (129036, 129039), (129096, 129103), (129114, 129119), (129160, 129167),
    (129198, 129338), (129340, 129342), (129344, 129349), (129351, 129355),
    (129357, 129471), (129473, 131069)
)


def is_emoji(char):
    """Returns True if 'char' looks like an emoji.

    'char' must be a single character, since it is passed to ord(). It is
    considered an emoji when its code point falls inside any of the
    inclusive (first, last) pairs of EMOJI_RANGES.
    """
    code_point = ord(char)
    return any(first <= code_point <= last for first, last in EMOJI_RANGES)
def emojiness(char):
@ -44,7 +55,7 @@ def emojiness(char):
"""
if not is_emoji(char):
return 1
if ord(char) < ord('🤐'):
if ord(char) < 129296:
return 2
else:
return 3

View File

@ -0,0 +1,101 @@
"""
Simple module to allow fetching unicode.org emoji lists and printing a
Python-like tuple out of them.
May not be 100% accurate, and is definitely not as efficient as it could be,
but it should only be run whenever the Unicode Consortium decides to add
new emojis to the list.
"""
import os
import sys
import re
import urllib.error
import urllib.request
def eprint(*args, **kwargs):
    """Print to standard error.

    Takes the same positional and keyword arguments as print(), except
    for 'file', which is always set to sys.stderr here.
    """
    print(*args, file=sys.stderr, **kwargs)
def get(url, enc='utf-8'):
    """Download 'url' and return its body as text.

    The response bytes are decoded with 'enc'; bytes that cannot be
    decoded are replaced rather than raising. If the request fails with
    an HTTP error, the error is reported on standard error and an empty
    string is returned instead, so callers can carry on with the
    remaining URLs.
    """
    try:
        with urllib.request.urlopen(url) as response:
            raw = response.read()
    except urllib.error.HTTPError as e:
        eprint('Caught', e, 'for', url, '; returning empty')
        return ''

    return raw.decode(enc, errors='replace')
# Base URL; each version name found in its listing is appended to it.
PREFIX_URL = 'http://unicode.org/Public/emoji/'

# Data files fetched for every version (appended after the version name).
SUFFIX_URL = '/emoji-data.txt', '/emoji-sequences.txt'

# Captures version-like names such as "5.0" from text of the form ">5.0/<".
# The dot is escaped so only a literal '.' may separate the two numbers.
VERSION_RE = re.compile(r'>(\d+\.\d+)/<')

# File the matched code point text is written to, and later read back from.
OUTPUT_TXT = 'emojies.txt'

# Two runs of 3+ hex digits joined by whitespace and/or dots, for example
# "1F600..1F64F" or "0023 20E3".
# NOTE(review): the second run is mandatory, so text holding only a single
# run of hex digits never matches -- confirm that this is intended.
CODEPOINT_RE = re.compile(r'([\da-fA-F]{3,}(?:[\s.]+[\da-fA-F]{3,}))')

EMOJI_START = 0x20e3  # emoji data has many more ranges, falling outside this
EMOJI_END = 200000  # from some tests those outside the range aren't emojis
# Build the local cache only when it does not exist yet. The version listing
# is requested inside this check so that no network access happens at all
# when OUTPUT_TXT is already present.
if not os.path.isfile(OUTPUT_TXT):
    # Fetched before the file is opened, so a failure here does not leave an
    # empty cache file behind.
    versions = VERSION_RE.findall(get(PREFIX_URL))
    with open(OUTPUT_TXT, 'w') as f:
        for version in versions:
            for s in SUFFIX_URL:
                url = PREFIX_URL + version + s
                for line in get(url).split('\n'):
                    line = line.strip()
                    if not line or line.startswith('#'):
                        continue  # blank line or comment

                    # Only keep code point text found at the very start of
                    # the line; one match is written per output line.
                    m = CODEPOINT_RE.match(line)
                    if m:
                        f.write(m.group(1) + '\n')
# Collect every code point above 255 listed in the cached file. Each line is
# expected to hold either an inclusive "START..END" range or one or more
# whitespace-separated code points, all written in hexadecimal.
points = set()
with open(OUTPUT_TXT) as f:
    for line in f:
        line = line.strip()
        if not line:
            continue  # int('', 16) would raise on a blank line

        if ' ' not in line and '..' in line:
            s, e = line.split('..')
            candidates = range(int(s, 16), int(e, 16) + 1)
        else:
            # Covers both a single code point and a sequence of them.
            candidates = [int(p, 16) for p in line.split()]

        points.update(i for i in candidates if i > 255)
def _close_run(first, final, kept):
    """Append the inclusive run first..final to 'kept', or report it as
    dropped when it is a single code point or when 'first' is not strictly
    between EMOJI_START and EMOJI_END."""
    if first == final or not (EMOJI_START < first < EMOJI_END):
        eprint(
            'Dropping', final - first + 1,
            'character(s) from', hex(first), ':', chr(first)
        )
    else:
        kept.append((first, final))


# Group the sorted code points into runs of consecutive values.
ranges = []
points = tuple(sorted(points))
if points:  # nothing to group when no code point was collected
    start = last = points[0]
    for point in points:
        if point - last > 1:
            # Gap found: the current run ended at 'last'.
            _close_run(start, last, ranges)
            start = point
        last = point

    # The final run is never followed by a gap, so close it explicitly.
    _close_run(start, last, ranges)

# A lone range needs a trailing comma, otherwise "((a, b))" would evaluate to
# the pair itself instead of a one-element tuple of ranges.
body = ', '.join(repr(r) for r in ranges)
if len(ranges) == 1:
    body += ','

print('EMOJI_RANGES = ({})'.format(body))