From c32126359ae203368e5ea254503fc732171572cd Mon Sep 17 00:00:00 2001
From: adrianeboyd
Date: Mon, 9 Sep 2019 19:19:22 +0200
Subject: [PATCH] Allow period as suffix following punctuation (#4248)

Addresses rare cases (such as `_MATH_.`, see #1061) where the final
period was not recognized as a suffix following punctuation.
---
 spacy/lang/punctuation.py                       | 6 +++---
 spacy/tests/lang/en/test_prefix_suffix_infix.py | 6 ++++++
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/spacy/lang/punctuation.py b/spacy/lang/punctuation.py
index 5969be22e..ccb72de28 100644
--- a/spacy/lang/punctuation.py
+++ b/spacy/lang/punctuation.py
@@ -3,7 +3,7 @@ from __future__ import unicode_literals
 
 from .char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY
 from .char_classes import LIST_ICONS, HYPHENS, CURRENCY, UNITS
-from .char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
+from .char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT
 
 
 _prefixes = (
@@ -27,8 +27,8 @@ _suffixes = (
         r"(?<=°[FfCcKk])\.",
         r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
         r"(?<=[0-9])(?:{u})".format(u=UNITS),
-        r"(?<=[0-9{al}{e}(?:{q})])\.".format(
-            al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES
+        r"(?<=[0-9{al}{e}{p}(?:{q})])\.".format(
+            al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, p=PUNCT
         ),
         r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER),
     ]
diff --git a/spacy/tests/lang/en/test_prefix_suffix_infix.py b/spacy/tests/lang/en/test_prefix_suffix_infix.py
index e9d75111d..3dccd6bcf 100644
--- a/spacy/tests/lang/en/test_prefix_suffix_infix.py
+++ b/spacy/tests/lang/en/test_prefix_suffix_infix.py
@@ -133,3 +133,9 @@ def test_en_tokenizer_splits_em_dash_infix(en_tokenizer):
     assert tokens[6].text == "Puddleton"
     assert tokens[7].text == "?"
     assert tokens[8].text == "\u2014"
+
+
+@pytest.mark.parametrize("text,length", [("_MATH_", 3), ("_MATH_.", 4)])
+def test_final_period(en_tokenizer, text, length):
+    tokens = en_tokenizer(text)
+    assert len(tokens) == length