Telethon/telethon_generator/emoji_ranges.py
2017-10-29 16:41:30 +01:00

102 lines
2.8 KiB
Python

"""
Simple module to allow fetching unicode.org emoji lists and printing a
Python-like tuple out of them.
It may not be 100% accurate, and is definitely not as efficient as it could
be, but it only needs to be run whenever the Unicode Consortium decides to
add new emojis to the list.
"""
import os
import sys
import re
import urllib.error
import urllib.request
def eprint(*args, **kwargs):
    """
    Print to standard error instead of standard output.

    Accepts the same positional and keyword arguments as the built-in
    ``print``, except ``file``, which is always ``sys.stderr``.
    """
    stream = sys.stderr  # looked up on every call, not cached at import
    print(*args, file=stream, **kwargs)
def get(url, enc='utf-8'):
    """
    Fetch ``url`` and return its body as text.

    The raw bytes are decoded with ``enc``; undecodable bytes are replaced
    rather than raising. If the server answers with an HTTP error, the error
    is reported on standard error and an empty string is returned instead.
    Any other failure propagates to the caller.
    """
    try:
        with urllib.request.urlopen(url) as response:
            payload = response.read()
            return payload.decode(enc, errors='replace')
    except urllib.error.HTTPError as http_error:
        eprint('Caught', http_error, 'for', url, '; returning empty')
        return ''
# Base directory of the published emoji data; one sub-directory per version.
PREFIX_URL = 'http://unicode.org/Public/emoji/'

# Files fetched from every version directory.
SUFFIX_URL = ('/emoji-data.txt', '/emoji-sequences.txt')

# Extracts version directory names such as ``5.0`` from link text of the form
# ``>5.0/<`` in the index page. The dot is escaped so that only a literal
# ``major.minor`` is accepted (unescaped, ``>1234/<`` would match as well).
VERSION_RE = re.compile(r'>(\d+\.\d+)/<')

# Local cache holding one matched code point entry per line.
OUTPUT_TXT = 'emojies.txt'

# Matches two hexadecimal tokens (three or more digits each) separated by
# whitespace and/or dots, e.g. ``1F600..1F64F`` or ``0023 FE0F``.
# NOTE(review): the trailing group is mandatory, so a line that starts with a
# lone code point followed by ';' does not match, and only the first two
# tokens of a longer sequence are captured. Confirm whether that is intended
# before relaxing it, since it changes the generated table.
CODEPOINT_RE = re.compile(r'([\da-fA-F]{3,}(?:[\s.]+[\da-fA-F]{3,}))')

# Exclusive bounds applied to the first code point of a run when deciding
# whether to keep it. Note that EMOJI_END is written in decimal (0x30d40).
EMOJI_START = 0x20e3  # emoji data has many more ranges, falling outside this
EMOJI_END = 200000  # from some tests those outside the range aren't emojis
def _parse_entry(entry):
    """
    Expand one cached entry into the code points it names.

    ``entry`` is a stripped, non-empty line of the cache file in one of three
    shapes, all hexadecimal: space separated code points (``'0023 FE0F'``),
    an inclusive range (``'1F600..1F64F'``) or a single code point.

    Raises ``ValueError`` if a token is not valid hexadecimal.
    """
    if ' ' in entry:
        return [int(token, 16) for token in entry.split()]
    if '.' in entry:
        first, final = entry.split('..')
        return range(int(first, 16), int(final, 16) + 1)
    return [int(entry, 16)]


def _close_run(start, last, kept):
    """
    Finish the run of consecutive code points ``start``..``last``.

    The run is appended to ``kept`` as ``(start, last)`` unless it holds a
    single code point or ``start`` is not strictly between ``EMOJI_START``
    and ``EMOJI_END``; dropped runs are reported on standard error.
    """
    if start == last or not (EMOJI_START < start < EMOJI_END):
        eprint(
            'Dropping', last - start + 1,
            'character(s) from', hex(start), ':', chr(start)
        )
    else:
        kept.append((start, last))


lines = []  # NOTE(review): not used anywhere in the visible code

# Step 1: build the cache of raw entries, unless a previous run left one.
versions = []
if not os.path.isfile(OUTPUT_TXT):
    # The version index is only needed for downloading, so it is fetched here
    # and not unconditionally; a run with a cache needs no network at all.
    versions = VERSION_RE.findall(get(PREFIX_URL))
    entries = []
    for version in versions:
        for suffix in SUFFIX_URL:
            for line in get(PREFIX_URL + version + suffix).split('\n'):
                line = line.strip()
                if not line or line.startswith('#'):
                    continue
                # Only accept an entry located at the very start of the line.
                m = CODEPOINT_RE.match(line)
                if m:
                    entries.append(m.group(1))

    if entries:
        # Written in one go after all downloads, so a failure half way
        # through never leaves a truncated cache behind.
        with open(OUTPUT_TXT, 'w') as f:
            f.writelines(entry + '\n' for entry in entries)
    else:
        # An empty cache would make every later run skip the download.
        eprint('Nothing downloaded; not writing', OUTPUT_TXT)

# Step 2: expand the cached entries into a set of individual code points.
points = set()
if os.path.isfile(OUTPUT_TXT):
    with open(OUTPUT_TXT) as f:
        for line in f:
            line = line.strip()
            if line:
                # Code points 0-255 are discarded.
                points.update(p for p in _parse_entry(line) if p > 255)

if not points:
    # Without this the range building below would fail on ``points[0]``.
    raise SystemExit(
        'No code points above 255 available; is {} missing or empty?'
        .format(OUTPUT_TXT)
    )

# Step 3: merge consecutive code points into inclusive (start, last) runs.
ranges = []
points = tuple(sorted(points))
start = last = points[0]
for point in points[1:]:
    if point - last > 1:
        # A gap ends the current run and starts a new one at ``point``.
        _close_run(start, last, ranges)
        start = point
    last = point
_close_run(start, last, ranges)

# ``tuple`` formats exactly like the joined reprs for two or more ranges, and
# also yields a valid one-element tuple ``((a, b),)`` for a single range.
print('EMOJI_RANGES = {}'.format(tuple(ranges)))