English tokenizer: add language-specific infix rules.

Summary of what this patch does, per file:

  * spacy/lang/en/__init__.py
      Imports TOKENIZER_INFIXES from the new .punctuation module and assigns
      it to EnglishDefaults.infixes.

  * spacy/lang/en/punctuation.py (new file)
      Builds the infix pattern list as LIST_ELLIPSES + LIST_ICONS + five
      regular expressions, then exports it as TOKENIZER_INFIXES:
        1. one of + - * ^ preceded by a digit and followed by a digit or "-"
        2. "." preceded by an ALPHA_LOWER/CONCAT_QUOTES character and followed
           by an ALPHA_UPPER/CONCAT_QUOTES character
        3. "," with an ALPHA character on both sides
        4. any HYPHENS alternative preceded by an ALPHA character or digit and
           followed by an ALPHA character
        5. one of : < > = / preceded by an ALPHA character or digit and
           followed by an ALPHA character
      NOTE(review): PUNCT is imported but not referenced anywhere in the new
      module - confirm whether it is needed.

  * spacy/tests/lang/en/test_text.py
      The "But then the 6,000-year ice age came..." case (10 tokens expected)
      is no longer marked xfail.

  * spacy/tests/lang/fr/test_exceptions.py
      Removes "11-septembre" and "11-Septembre" from the parametrized list.

NOTE(review): this patch was recovered from a copy whose line breaks and
indentation had been collapsed to single spaces. Line boundaries were rebuilt
from the hunk headers (every hunk's old/new line counts match its header);
leading indentation of the Python lines was re-derived from Python syntax and
should be confirmed against the target tree before applying.

diff --git a/spacy/lang/en/__init__.py b/spacy/lang/en/__init__.py
index 9626704da..26cdb4509 100644
--- a/spacy/lang/en/__init__.py
+++ b/spacy/lang/en/__init__.py
@@ -3,6 +3,7 @@
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .syntax_iterators import SYNTAX_ITERATORS
+from .punctuation import TOKENIZER_INFIXES
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
 from ...attrs import LANG
@@ -20,6 +21,7 @@ class EnglishDefaults(Language.Defaults):
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     stop_words = STOP_WORDS
     syntax_iterators = SYNTAX_ITERATORS
+    infixes = TOKENIZER_INFIXES
     single_orth_variants = [
         {"tags": ["NFP"], "variants": ["…", "..."]},
         {"tags": [":"], "variants": ["-", "—", "–", "--", "---", "——"]},
diff --git a/spacy/lang/en/punctuation.py b/spacy/lang/en/punctuation.py
new file mode 100644
index 000000000..67e3e80e5
--- /dev/null
+++ b/spacy/lang/en/punctuation.py
@@ -0,0 +1,19 @@
+from ..char_classes import LIST_ELLIPSES, LIST_ICONS, HYPHENS
+from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT
+
+_infixes = (
+    LIST_ELLIPSES
+    + LIST_ICONS
+    + [
+        r"(?<=[0-9])[+\-\*^](?=[0-9-])",
+        r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
+            al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
+        ),
+        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}0-9])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
+        r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
+    ]
+)
+
+
+TOKENIZER_INFIXES = _infixes
diff --git a/spacy/tests/lang/en/test_text.py b/spacy/tests/lang/en/test_text.py
index c5d56d885..4d4d0a643 100644
--- a/spacy/tests/lang/en/test_text.py
+++ b/spacy/tests/lang/en/test_text.py
@@ -26,9 +26,7 @@ untimely death" of the rapier-tongued Scottish barrister and parliamentarian.
         ("""Yes! "I'd rather have a walk", Ms. Comble sighed. """, 15),
         ("""'Me too!', Mr. P. Delaware cried. """, 11),
         ("They ran about 10km.", 6),
-        pytest.param(
-            "But then the 6,000-year ice age came...", 10, marks=pytest.mark.xfail()
-        ),
+        ("But then the 6,000-year ice age came...", 10),
     ],
 )
 def test_en_tokenizer_handles_cnts(en_tokenizer, text, length):
diff --git a/spacy/tests/lang/fr/test_exceptions.py b/spacy/tests/lang/fr/test_exceptions.py
index 98d318f6e..91c0a0a4d 100644
--- a/spacy/tests/lang/fr/test_exceptions.py
+++ b/spacy/tests/lang/fr/test_exceptions.py
@@ -16,8 +16,6 @@ import pytest
         "grand'hamien",
         "Châteauneuf-la-Forêt",
         "Château-Guibert",
-        "11-septembre",
-        "11-Septembre",
         "refox-trottâmes",
         # u"K-POP",
         # u"K-Pop",