Telethon/telethon_generator/emoji_ranges.py

"""
Simple module to allow fetching unicode.org emoji lists and printing a
Python-like tuple out of them.

May not be 100% accurate, and is definitely not as efficient as it could be,
but it should only be run whenever the Unicode consortium decides to add
new emoji to the list.
"""
import os
import sys
import re
import urllib.error
import urllib.request


def eprint(*args, **kwargs):
    """Print *args* to standard error, forwarding *kwargs* to ``print``.

    ``sys.stderr`` is looked up on every call, so a redirected stream is
    honoured. Passing ``file=`` raises ``TypeError``, since the target
    stream is fixed here.
    """
    stream = sys.stderr
    print(*args, file=stream, **kwargs)


def get(url, enc='utf-8'):
    """Download *url* and return its body decoded as text.

    :param url: the address to open with ``urllib.request.urlopen``.
    :param enc: the encoding used to decode the body; bytes that are not
                valid in it are replaced rather than raising.
    :return: the decoded body, or an empty string if the server answered
             with an HTTP error (which is reported on standard error).
    """
    try:
        with urllib.request.urlopen(url) as response:
            payload = response.read()
        return payload.decode(enc, errors='replace')
    except urllib.error.HTTPError as err:
        # Best effort: a missing file must not abort the whole run.
        eprint('Caught', err, 'for', url, '; returning empty')
    return ''


# Base address; its index page is scanned with VERSION_RE, and each version
# found is combined with every SUFFIX_URL entry to build a download URL.
PREFIX_URL = 'http://unicode.org/Public/emoji/'
SUFFIX_URL = '/emoji-data.txt', '/emoji-sequences.txt'

# Matches version links such as ``>5.0/<`` in the index page. The dot is
# escaped so that only a literal ``major.minor`` name is accepted (left
# unescaped it would also accept e.g. ``>2017/<``).
VERSION_RE = re.compile(r'>(\d+\.\d+)/<')

# File caching the code points extracted from the downloaded data.
OUTPUT_TXT = 'emojies.txt'

# One hexadecimal code point, optionally followed by any number of further
# code points separated by whitespace (a sequence) or dots (a ``A..B``
# range). The continuation group must be optional and repeatable: when it is
# mandatory, lines holding a single code point never match at column 0 and
# sequences longer than two code points get truncated.
CODEPOINT_RE = re.compile(r'([\da-fA-F]{3,}(?:[\s.]+[\da-fA-F]{3,})*)')

# Both bounds are exclusive and are compared against the first code point
# of each candidate range.
EMOJI_START = 0x20e3  # emoji data has many more ranges, falling outside this
EMOJI_END = 200000  # (decimal) from some tests those outside aren't emoji


# Build the OUTPUT_TXT cache, unless a previous run already left one behind.
lines = []
if not os.path.isfile(OUTPUT_TXT):
    # The version index is only needed on a cache miss; fetching it here
    # (rather than unconditionally) lets a cached run work offline.
    versions = VERSION_RE.findall(get(PREFIX_URL))
    for version in versions:
        for s in SUFFIX_URL:
            url = PREFIX_URL + version + s
            for line in get(url).split('\n'):
                line = line.strip()
                if not line or line.startswith('#'):
                    continue
                # Only entries whose code points open the line are wanted.
                m = CODEPOINT_RE.match(line)
                if m:
                    lines.append(m.group(1) + '\n')

    # Write everything at once, after all downloads are done: if a download
    # raised half-way through, no truncated file is left for later runs to
    # mistake for a complete cache.
    with open(OUTPUT_TXT, 'w') as f:
        f.writelines(lines)


# Collect every code point above 255 listed in the cache file. Each line is
# either a whitespace-separated sequence, an inclusive ``A..B`` range, or a
# single value; all numbers are hexadecimal.
points = set()
with open(OUTPUT_TXT) as f:
    for raw in f:
        entry = raw.strip()
        if ' ' in entry:
            values = [int(token, 16) for token in entry.split()]
        elif '.' in entry:
            first, final = entry.split('..')
            values = range(int(first, 16), int(final, 16) + 1)
        else:
            values = (int(entry, 16),)

        points.update(value for value in values if value > 255)


def _keep_or_drop(first, final, out):
    """Append the run ``(first, final)`` to *out*, or report it as dropped.

    A run is dropped when it holds a single code point, or when its first
    code point is not strictly between EMOJI_START and EMOJI_END.
    """
    if first == final or not (EMOJI_START < first < EMOJI_END):
        eprint(
            'Dropping', final - first + 1,
            'character(s) from', hex(first), ':', chr(first)
        )
    else:
        out.append((first, final))


# Merge the sorted code points into inclusive (start, last) runs of
# consecutive values.
ranges = []
points = tuple(sorted(points))
if points:  # nothing to merge (and no points[0]) when no data was collected
    start = last = points[0]
    for point in points[1:]:
        if point - last > 1:
            # Gap found: the current run is over, a new one starts here.
            _keep_or_drop(start, last, ranges)
            start = point

        last = point

    # The run still open when the loop ends has not been handled yet.
    _keep_or_drop(start, last, ranges)


# repr() of a tuple keeps the output a valid tuple of tuples even when
# there is exactly one range (it adds the required trailing comma).
print('EMOJI_RANGES = {!r}'.format(tuple(ranges)))