Allow period as suffix following punctuation (#4248)

Addresses rare cases (such as `_MATH_.`, see #1061) where the final
period was not recognized as a suffix following punctuation.
This commit is contained in:
adrianeboyd 2019-09-09 19:19:22 +02:00 committed by Matthew Honnibal
parent 3e8f136ba7
commit c32126359a
2 changed files with 9 additions and 3 deletions

View File

@ -3,7 +3,7 @@ from __future__ import unicode_literals
from .char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY
from .char_classes import LIST_ICONS, HYPHENS, CURRENCY, UNITS
from .char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
from .char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT
_prefixes = (
@ -27,8 +27,8 @@ _suffixes = (
r"(?<=°[FfCcKk])\.",
r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
r"(?<=[0-9])(?:{u})".format(u=UNITS),
r"(?<=[0-9{al}{e}(?:{q})])\.".format(
al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES
r"(?<=[0-9{al}{e}{p}(?:{q})])\.".format(
al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, p=PUNCT
),
r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER),
]

View File

@ -133,3 +133,9 @@ def test_en_tokenizer_splits_em_dash_infix(en_tokenizer):
assert tokens[6].text == "Puddleton"
assert tokens[7].text == "?"
assert tokens[8].text == "\u2014"
@pytest.mark.parametrize("text,length", [("_MATH_", 3), ("_MATH_.", 4)])
def test_final_period(en_tokenizer, text, length):
tokens = en_tokenizer(text)
assert len(tokens) == length