Mirror of https://github.com/explosion/spaCy.git, synced 2024-12-24 17:06:29 +03:00
Allow period as suffix following punctuation (#4248)
Addresses rare cases (such as `_MATH_.`, see #1061) where the final period was not recognized as a suffix following punctuation.
This commit is contained in:
parent 3e8f136ba7
commit c32126359a
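In behavioural terms, the patch lets the English tokenizer split the trailing period off a token that ends in punctuation. A minimal sketch of the intended result, assuming spaCy with this change applied and the default English pipeline (the token counts come from the test added below):

# Minimal sketch of the intended behaviour; token counts match the new test below.
from spacy.lang.en import English

nlp = English()

doc = nlp("_MATH_.")
print([t.text for t in doc])  # with this patch the final "." becomes its own token
print(len(doc))               # 4 (the bare "_MATH_" still yields 3 tokens)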
@@ -3,7 +3,7 @@ from __future__ import unicode_literals
 from .char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY
 from .char_classes import LIST_ICONS, HYPHENS, CURRENCY, UNITS
-from .char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
+from .char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT
 
 
 _prefixes = (
@@ -27,8 +27,8 @@ _suffixes = (
         r"(?<=°[FfCcKk])\.",
         r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
         r"(?<=[0-9])(?:{u})".format(u=UNITS),
-        r"(?<=[0-9{al}{e}(?:{q})])\.".format(
-            al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES
+        r"(?<=[0-9{al}{e}{p}(?:{q})])\.".format(
+            al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, p=PUNCT
         ),
         r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER),
     ]
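The functional change is confined to the lookbehind character class of the period-suffix rule: PUNCT now counts among the characters allowed to precede a final ".". A rough illustration using the standard re module, with a hand-picked punctuation subset standing in for spaCy's PUNCT class (a simplification, not the actual compiled suffix regex):

# Simplified illustration of the lookbehind change; PUNCT_SUBSET is an assumed
# stand-in for spaCy's PUNCT character class.
import re

ALPHA_LOWER = "a-z"
PUNCT_SUBSET = r"_#\*&"

old_rule = re.compile(r"(?<=[0-9{al}%²\-\+])\.$".format(al=ALPHA_LOWER))
new_rule = re.compile(r"(?<=[0-9{al}%²\-\+{p}])\.$".format(al=ALPHA_LOWER, p=PUNCT_SUBSET))

print(bool(old_rule.search("_MATH_.")))  # False: "_" before the final "." is not in the class
print(bool(new_rule.search("_MATH_.")))  # True: punctuation may now precede the final "."

The real pattern also keeps the CONCAT_QUOTES group inside the same character class; that part is untouched by this commit.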
@@ -133,3 +133,9 @@ def test_en_tokenizer_splits_em_dash_infix(en_tokenizer):
     assert tokens[6].text == "Puddleton"
     assert tokens[7].text == "?"
     assert tokens[8].text == "\u2014"
+
+
+@pytest.mark.parametrize("text,length", [("_MATH_", 3), ("_MATH_.", 4)])
+def test_final_period(en_tokenizer, text, length):
+    tokens = en_tokenizer(text)
+    assert len(tokens) == length
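The en_tokenizer argument is a fixture provided by the test suite's shared conftest (an English tokenizer), not something added in this diff; pytest.mark.parametrize runs the new test once per (text, length) pair, so both the bare placeholder and the one with a trailing period are covered.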