Hyphen infix (#5770)

* infix split on hyphen when preceded by number * clean up * skip Ukrainian test instead of xfail
2025-09-17 17:42:43 +03:00 · 2020-07-20 14:48:51 +02:00 · 2020-07-20 14:48:51 +02:00 · 1b2ec94382
commit 1b2ec94382
parent ec819fc311
4 changed files with 22 additions and 5 deletions
--- a/spacy/lang/en/__init__.py
+++ b/spacy/lang/en/__init__.py
@ -3,6 +3,7 @@ from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .syntax_iterators import SYNTAX_ITERATORS
 from .punctuation import TOKENIZER_INFIXES
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
 from ...attrs import LANG
@ -20,6 +21,7 @@ class EnglishDefaults(Language.Defaults):
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    stop_words = STOP_WORDS
    syntax_iterators = SYNTAX_ITERATORS
    infixes = TOKENIZER_INFIXES
    single_orth_variants = [
        {"tags": ["NFP"], "variants": ["…", "..."]},
        {"tags": [":"], "variants": ["-", "—", "–", "--", "---", "——"]},
--- a/spacy/lang/en/punctuation.py
+++ b/spacy/lang/en/punctuation.py
@ -0,0 +1,19 @@
 from ..char_classes import LIST_ELLIPSES, LIST_ICONS, HYPHENS
 from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT
 # Infix patterns exported as TOKENIZER_INFIXES: the shared ellipsis and icon
 # lists from char_classes, followed by the punctuation rules listed below.
 # Each rule uses a lookbehind/lookahead pair, so only the character(s) in the
 # middle are matched; the surrounding context is required but not consumed.
 _infixes = (
    LIST_ELLIPSES
    + LIST_ICONS
    + [
        # One of + - * ^ directly after a digit and directly before a digit
        # or a "-".
        r"(?<=[0-9])[+\-\*^](?=[0-9-])",
        # A period preceded by a character from ALPHA_LOWER or CONCAT_QUOTES
        # and followed by a character from ALPHA_UPPER or CONCAT_QUOTES
        # (presumably lowercase/uppercase letters and quote characters;
        # the classes are defined in char_classes, not shown here).
        r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
            al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
        ),
        # A comma with an ALPHA character on both sides.
        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
        # Any HYPHENS alternative preceded by an ALPHA character or a digit
        # and followed by an ALPHA character. The "0-9" in the lookbehind is
        # what lets a hyphen match after a number (e.g. "6,000-year").
        r"(?<=[{a}0-9])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
        # One of : < > = / preceded by an ALPHA character or a digit and
        # followed by an ALPHA character.
        r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
    ]
 )
 # Public name for the infix list built above.
 TOKENIZER_INFIXES = _infixes
--- a/spacy/tests/lang/en/test_text.py
+++ b/spacy/tests/lang/en/test_text.py
@ -26,9 +26,7 @@ untimely death" of the rapier-tongued Scottish barrister and parliamentarian.
        ("""Yes! "I'd rather have a walk", Ms. Comble sighed. """, 15),
        ("""'Me too!', Mr. P. Delaware cried. """, 11),
        ("They ran about 10km.", 6),
-        pytest.param(
-            "But then the 6,000-year ice age came...", 10, marks=pytest.mark.xfail()
-        ),
+        ("But then the 6,000-year ice age came...", 10),
    ],
 )
 def test_en_tokenizer_handles_cnts(en_tokenizer, text, length):
--- a/spacy/tests/lang/fr/test_exceptions.py
+++ b/spacy/tests/lang/fr/test_exceptions.py
@ -16,8 +16,6 @@ import pytest
        "grand'hamien",
        "Châteauneuf-la-Forêt",
        "Château-Guibert",
-        "11-septembre",
-        "11-Septembre",
        "refox-trottâmes",
        # u"K-POP",
        # u"K-Pop",