💫 Add en/em dash to prefixes and suffixes (#3281)

* Auto-format
* Add en/em dash to prefixes and suffixes
2025-11-22 18:55:43 +03:00 · 2019-02-15 10:29:59 +01:00 · 2019-02-15 10:29:59 +01:00 · c31a9dabd5
commit c31a9dabd5
parent 5651a0d052
2 changed files with 20 additions and 8 deletions
--- a/spacy/lang/punctuation.py
+++ b/spacy/lang/punctuation.py
@@ -1,14 +1,13 @@
 # coding: utf8
 from __future__ import unicode_literals
-from .char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY, LIST_ICONS
+from .char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY
-from .char_classes import HYPHENS
+from .char_classes import LIST_ICONS, HYPHENS, CURRENCY, UNITS
 from .char_classes import CURRENCY, UNITS
 from .char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
 _prefixes = (
-    ["§", "%", "=", r"\+(?![0-9])"]
+    ["§", "%", "=", "—", "–", r"\+(?![0-9])"]
    + LIST_PUNCT
    + LIST_ELLIPSES
    + LIST_QUOTES
@@ -22,13 +21,15 @@ _suffixes = (
    + LIST_ELLIPSES
    + LIST_QUOTES
    + LIST_ICONS
-    + ["'s", "'S", "’s", "’S"]
+    + ["'s", "'S", "’s", "’S", "—", "–"]
    + [
        r"(?<=[0-9])\+",
        r"(?<=°[FfCcKk])\.",
        r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
        r"(?<=[0-9])(?:{u})".format(u=UNITS),
-        r"(?<=[0-9{al}{e}(?:{q})])\.".format(al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES),
+        r"(?<=[0-9{al}{e}(?:{q})])\.".format(
            al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES
        ),
        r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER),
    ]
 )
@@ -40,8 +41,8 @@ _infixes = (
        r"(?<=[0-9])[+\-\*^](?=[0-9-])",
        r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
-        r'(?<=[{a}])(?:{h})(?=[{a}])'.format(a=ALPHA, h=HYPHENS),
+        r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
-        r'(?<=[{a}])[:<>=/](?=[{a}])'.format(a=ALPHA),
+        r"(?<=[{a}])[:<>=/](?=[{a}])".format(a=ALPHA),
    ]
 )
--- a/spacy/tests/regression/test_issue3277.py
+++ b/spacy/tests/regression/test_issue3277.py
@@ -0,0 +1,11 @@
 # coding: utf-8
 from __future__ import unicode_literals
 def test_issue3277(es_tokenizer):
    """Test that hyphens are split correctly as prefixes."""
    doc = es_tokenizer("—Yo me llamo... –murmuró el niño– Emilio Sánchez Pérez.")
    assert len(doc) == 14
    assert doc[0].text == "\u2014"
    assert doc[5].text == "\u2013"
    assert doc[9].text == "\u2013"