# coding: utf8
"""Character classes and symbol lists shared by the language data.

Origin: ``spacy/lang/char_classes.py``, introduced by commit 604f299c
("Add char classes to global language data").

The module exposes three kinds of constants, all of them *pattern source
strings* (nothing here is compiled):

* ``ALPHA``, ``ALPHA_LOWER``, ``ALPHA_UPPER`` -- a single character class
  each, built as a union of per-script classes.
* ``UNITS``, ``CURRENCY``, ``QUOTES``, ``PUNCT``, ``HYPHENS`` -- the symbols
  of each group joined with ``|``.
* ``LIST_UNITS``, ``LIST_CURRENCY``, ``LIST_QUOTES``, ``LIST_PUNCT``,
  ``LIST_HYPHENS``, ``LIST_ELLIPSES`` -- the same symbols as lists with one
  pattern per entry.
"""
from __future__ import unicode_literals

import regex as re


# The character classes below rely on nested sets and the ``&&`` / ``||``
# set operators, which the ``regex`` package only supports in VERSION1 mode.
# NOTE: this assigns a module attribute of ``regex`` itself, so it applies to
# every user of the package in the process, not just to this module.
re.DEFAULT_VERSION = re.VERSION1


def merge_char_classes(classes):
    """Combine character classes into one class matching any of them.

    classes (iterable of unicode): Character class patterns.
    RETURNS (unicode): The classes joined with ``||`` inside ``[...]``.
    """
    return '[{}]'.format('||'.join(classes))


def split_chars(char):
    """Split a space-separated string of symbols into a list.

    Empty pieces (caused by repeated spaces or an empty input) are dropped,
    so the result never contains an empty pattern.

    char (unicode): Symbols separated by single spaces.
    RETURNS (list): The individual symbols, in their original order.
    """
    return [piece for piece in char.strip().split(' ') if piece]


def merge_chars(char):
    """Turn a space-separated string of symbols into a regex alternation.

    Built on top of ``split_chars`` so that repeated spaces cannot produce an
    empty alternative (``a||b``), which would match the empty string.

    char (unicode): Symbols separated by single spaces.
    RETURNS (unicode): The symbols joined with ``|``.
    """
    return '|'.join(split_chars(char))


# Per-script classes: a general category intersected (``&&``) with a script.
_bengali = r'[\p{L}&&\p{Bengali}]'
_hebrew = r'[\p{L}&&\p{Hebrew}]'
_latin_lower = r'[\p{Ll}&&\p{Latin}]'
_latin_upper = r'[\p{Lu}&&\p{Latin}]'
# Not referenced by any of the constants defined in this module.
_latin = r'[[\p{Ll}||\p{Lu}]&&\p{Latin}]'

_upper = [_latin_upper]
_lower = [_latin_lower]
# These classes are added to the lower- AND the upper-case groups below.
_uncased = [_bengali, _hebrew]


ALPHA = merge_char_classes(_upper + _lower + _uncased)
ALPHA_LOWER = merge_char_classes(_lower + _uncased)
ALPHA_UPPER = merge_char_classes(_upper + _uncased)


# Symbol groups, written as space-separated strings. Entries in the raw
# strings are already regex-escaped where necessary.
_units = ('km km² km³ m m² m³ dm dm² dm³ cm cm² cm³ mm mm² mm³ ha µm nm yd in ft '
          'kg g mg µg t lb oz m/s km/h kmh mph hPa Pa mbar mb MB kb KB gb GB tb '
          'TB T G M K')
_currency = r'\$ £ € ¥ ฿ US\$ C\$ A\$'
_punct = r'… , : ; \! \? ¿ ¡ \( \) \[ \] \{ \} < > _ # \* &'
_quotes = r'\' \'\' " ” “ `` ` ‘ ´ ‚ , „ » «'
_hyphens = '- – — -- ---'


# Alternation form: one pattern string per group.
UNITS = merge_chars(_units)
CURRENCY = merge_chars(_currency)
QUOTES = merge_chars(_quotes)
PUNCT = merge_chars(_punct)
HYPHENS = merge_chars(_hyphens)

# List form: one pattern per symbol.
LIST_UNITS = split_chars(_units)
LIST_CURRENCY = split_chars(_currency)
LIST_QUOTES = split_chars(_quotes)
LIST_PUNCT = split_chars(_punct)
LIST_HYPHENS = split_chars(_hyphens)
LIST_ELLIPSES = [r'\.\.+', '…']