diff --git a/spacy/fr/tokenizer_exceptions.py b/spacy/fr/tokenizer_exceptions.py
index 43806e270..eef7d789d 100644
--- a/spacy/fr/tokenizer_exceptions.py
+++ b/spacy/fr/tokenizer_exceptions.py
@@ -13,7 +13,7 @@ from ..symbols import *
 
 import os
 import io
-import re
+import regex as re
 
 
 def get_exceptions():
diff --git a/spacy/hu/tokenizer_exceptions.py b/spacy/hu/tokenizer_exceptions.py
index 85cf72ec9..a6dc47511 100644
--- a/spacy/hu/tokenizer_exceptions.py
+++ b/spacy/hu/tokenizer_exceptions.py
@@ -1,7 +1,7 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-import re
+import regex as re
 
 from spacy.language_data.punctuation import ALPHA_LOWER, CURRENCY
 from ..language_data.tokenizer_exceptions import _URL_PATTERN
diff --git a/spacy/language_data/punctuation.py b/spacy/language_data/punctuation.py
index f94d91e80..f23b15bbc 100644
--- a/spacy/language_data/punctuation.py
+++ b/spacy/language_data/punctuation.py
@@ -1,21 +1,8 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-import re
-
-
-_ALPHA_LOWER = """
-a ä à á â ǎ æ ã å ā ă ą b c ç ć č ĉ ċ c̄ d ð ď e é è ê ë ė ȅ ȩ ẽ ę f g ĝ ğ h i ı
-î ï í ī ì ȉ ǐ į ĩ j k ķ l ł ļ m n ñ ń ň ņ o ö ó ò ő ô õ œ ø ō ő ǒ ơ p q r ř ŗ s
-ß ś š ş ŝ t ť u ú û ù ú ū ű ǔ ů ų ư v w ŵ x y ÿ ý ỳ ŷ ỹ z ź ž ż þ
-"""
-
-
-_ALPHA_UPPER = """
-A Ä À Á Â Ǎ Æ Ã Å Ā Ă Ą B C Ç Ć Č Ĉ Ċ C̄ D Ð Ď E É È Ê Ë Ė Ȅ Ȩ Ẽ Ę F G Ĝ Ğ H I İ
-Î Ï Í Ī Ì Ȉ Ǐ Į Ĩ J K Ķ L Ł Ļ M N Ñ Ń Ň Ņ O Ö Ó Ò Ő Ô Õ Œ Ø Ō Ő Ǒ Ơ P Q R Ř Ŗ S
-Ś Š Ş Ŝ T Ť U Ú Û Ù Ú Ū Ű Ǔ Ů Ų Ư V W Ŵ X Y Ÿ Ý Ỳ Ŷ Ỹ Z Ź Ž Ż Þ
-"""
+import regex as re
+re.DEFAULT_VERSION = re.VERSION1
 
 
 _UNITS = """
@@ -57,9 +44,16 @@ LIST_PUNCT = list(_PUNCT.strip().split())
 LIST_HYPHENS = list(_HYPHENS.strip().split())
 
 
-ALPHA_LOWER = _ALPHA_LOWER.strip().replace(' ', '').replace('\n', '')
-ALPHA_UPPER = _ALPHA_UPPER.strip().replace(' ', '').replace('\n', '')
-ALPHA = ALPHA_LOWER + ALPHA_UPPER
+BENGALI = r'[\p{L}&&\p{Bengali}]'
+HEBREW = r'[\p{L}&&\p{Hebrew}]'
+LATIN_LOWER = r'[\p{Ll}&&\p{Latin}]'
+LATIN_UPPER = r'[\p{Lu}&&\p{Latin}]'
+LATIN = r'[[\p{Ll}||\p{Lu}]&&\p{Latin}]'
+
+
+ALPHA_LOWER = '[{}]'.format('||'.join([BENGALI, HEBREW, LATIN_LOWER]))
+ALPHA_UPPER = '[{}]'.format('||'.join([BENGALI, HEBREW, LATIN_UPPER]))
+ALPHA = '[{}]'.format('||'.join([BENGALI, HEBREW, LATIN]))
 
 
 QUOTES = _QUOTES.strip().replace(' ', '|')
diff --git a/spacy/tests/he/test_tokenizer.py b/spacy/tests/he/test_tokenizer.py
index c2504a0e7..62ae84223 100644
--- a/spacy/tests/he/test_tokenizer.py
+++ b/spacy/tests/he/test_tokenizer.py
@@ -13,7 +13,7 @@ def test_tokenizer_handles_abbreviation(he_tokenizer, text, expected_tokens):
 
 
 @pytest.mark.parametrize('text,expected_tokens', [
-    pytest.mark.xfail(('עקבת אחריו בכל רחבי המדינה.', ['עקבת', 'אחריו', 'בכל', 'רחבי', 'המדינה', '.'])),
+    ('עקבת אחריו בכל רחבי המדינה.', ['עקבת', 'אחריו', 'בכל', 'רחבי', 'המדינה', '.']),
     ('עקבת אחריו בכל רחבי המדינה?', ['עקבת', 'אחריו', 'בכל', 'רחבי', 'המדינה', '?']),
     ('עקבת אחריו בכל רחבי המדינה!', ['עקבת', 'אחריו', 'בכל', 'רחבי', 'המדינה', '!']),
     ('עקבת אחריו בכל רחבי המדינה..', ['עקבת', 'אחריו', 'בכל', 'רחבי', 'המדינה', '..']),
diff --git a/spacy/util.py b/spacy/util.py
index 0ccdfbd72..0c7136522 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -2,7 +2,7 @@
 from __future__ import unicode_literals, print_function
 
 import ujson
-import re
+import regex as re
 from pathlib import Path
 import sys
 import textwrap
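
Note on the import swaps above: they are not cosmetic. The `regex` package, with `DEFAULT_VERSION` set to `VERSION1`, supports set operations (`&&` intersection, `||` union) and nested classes inside `[...]`, which the stdlib `re` module lacks. That is what lets punctuation.py replace the hand-maintained `_ALPHA_LOWER`/`_ALPHA_UPPER` character lists with per-script Unicode property classes, and it is why the previously xfailed Hebrew tokenizer test can be un-marked: Hebrew letters now count as `ALPHA`, so the trailing punctuation splits off. A minimal sketch of the behavior the new `ALPHA_*` patterns rely on (the sample strings and assertions below are illustrative, not taken from the test suite):

    import regex as re

    # VERSION1 semantics enable set operations and nesting inside [...]
    re.DEFAULT_VERSION = re.VERSION1

    # Script-restricted classes, as defined in punctuation.py
    HEBREW = r'[\p{L}&&\p{Hebrew}]'        # letters that are also Hebrew script
    LATIN_LOWER = r'[\p{Ll}&&\p{Latin}]'   # lowercase letters in Latin script

    # Composed class, mirroring how ALPHA_LOWER is built in the patch
    ALPHA_LOWER = '[{}]'.format('||'.join([HEBREW, LATIN_LOWER]))

    assert re.match(LATIN_LOWER, 'é')          # Latin lowercase matches
    assert re.match(LATIN_LOWER, 'É') is None  # uppercase excluded by \p{Ll}
    assert re.match(HEBREW, 'ש')               # a Hebrew letter matches
    assert re.match(ALPHA_LOWER, 'ש')          # the union covers both scripts
    assert re.match(ALPHA_LOWER, '.') is None  # punctuation never matches

Under the default VERSION0 semantics, `&&`, `||` and the nested brackets are not parsed as set operations, so the `re.DEFAULT_VERSION = re.VERSION1` line in punctuation.py is load-bearing, not optional.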