mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-24 17:06:29 +03:00
Allow period as suffix following punctuation (#4248)
Addresses rare cases (such as `_MATH_.`, see #1061) where the final period was not recognized as a suffix following punctuation.
This commit is contained in:
parent
3e8f136ba7
commit
c32126359a
|
@ -3,7 +3,7 @@ from __future__ import unicode_literals
|
|||
|
||||
from .char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY
|
||||
from .char_classes import LIST_ICONS, HYPHENS, CURRENCY, UNITS
|
||||
from .char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
|
||||
from .char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT
|
||||
|
||||
|
||||
_prefixes = (
|
||||
|
@ -27,8 +27,8 @@ _suffixes = (
|
|||
r"(?<=°[FfCcKk])\.",
|
||||
r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
|
||||
r"(?<=[0-9])(?:{u})".format(u=UNITS),
|
||||
r"(?<=[0-9{al}{e}(?:{q})])\.".format(
|
||||
al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES
|
||||
r"(?<=[0-9{al}{e}{p}(?:{q})])\.".format(
|
||||
al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, p=PUNCT
|
||||
),
|
||||
r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER),
|
||||
]
|
||||
|
|
|
@ -133,3 +133,9 @@ def test_en_tokenizer_splits_em_dash_infix(en_tokenizer):
|
|||
assert tokens[6].text == "Puddleton"
|
||||
assert tokens[7].text == "?"
|
||||
assert tokens[8].text == "\u2014"
|
||||
|
||||
|
||||
@pytest.mark.parametrize("text,length", [("_MATH_", 3), ("_MATH_.", 4)])
|
||||
def test_final_period(en_tokenizer, text, length):
|
||||
tokens = en_tokenizer(text)
|
||||
assert len(tokens) == length
|
||||
|
|
Loading…
Reference in New Issue
Block a user