Mirror of https://github.com/explosion/spaCy.git, synced 2024-12-24 17:06:29 +03:00
Allow period as suffix following punctuation (#4248)
Addresses rare cases (such as `_MATH_.`, see #1061) where the final period was not recognized as a suffix following punctuation.
This commit is contained in:
parent 3e8f136ba7
commit c32126359a
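In behavioural terms, the patch lets the English tokenizer split the trailing period off a token that ends in punctuation. A minimal sketch of the intended result, assuming spaCy with this change applied and the default English pipeline (the token counts come from the test added below):

# Minimal sketch of the intended behaviour; token counts match the new test below.
from spacy.lang.en import English

nlp = English()

doc = nlp("_MATH_.")
print([t.text for t in doc])  # with this patch the final "." becomes its own token
print(len(doc))               # 4 (the bare "_MATH_" still yields 3 tokens)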
@@ -3,7 +3,7 @@ from __future__ import unicode_literals
 from .char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY
 from .char_classes import LIST_ICONS, HYPHENS, CURRENCY, UNITS
-from .char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
+from .char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT
 
 
 _prefixes = (
@@ -27,8 +27,8 @@ _suffixes = (
         r"(?<=°[FfCcKk])\.",
         r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
         r"(?<=[0-9])(?:{u})".format(u=UNITS),
-        r"(?<=[0-9{al}{e}(?:{q})])\.".format(
-            al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES
+        r"(?<=[0-9{al}{e}{p}(?:{q})])\.".format(
+            al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, p=PUNCT
         ),
         r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER),
     ]
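The functional change is confined to the lookbehind character class of the period-suffix rule: PUNCT now counts among the characters allowed to precede a final ".". A rough illustration using the standard re module, with a hand-picked punctuation subset standing in for spaCy's PUNCT class (a simplification, not the actual compiled suffix regex):

# Simplified illustration of the lookbehind change; PUNCT_SUBSET is an assumed
# stand-in for spaCy's PUNCT character class.
import re

ALPHA_LOWER = "a-z"
PUNCT_SUBSET = r"_#\*&"

old_rule = re.compile(r"(?<=[0-9{al}%²\-\+])\.$".format(al=ALPHA_LOWER))
new_rule = re.compile(r"(?<=[0-9{al}%²\-\+{p}])\.$".format(al=ALPHA_LOWER, p=PUNCT_SUBSET))

print(bool(old_rule.search("_MATH_.")))  # False: "_" before the final "." is not in the class
print(bool(new_rule.search("_MATH_.")))  # True: punctuation may now precede the final "."

The real pattern also keeps the CONCAT_QUOTES group inside the same character class; that part is untouched by this commit.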
@@ -133,3 +133,9 @@ def test_en_tokenizer_splits_em_dash_infix(en_tokenizer):
     assert tokens[6].text == "Puddleton"
     assert tokens[7].text == "?"
     assert tokens[8].text == "\u2014"
+
+
+@pytest.mark.parametrize("text,length", [("_MATH_", 3), ("_MATH_.", 4)])
+def test_final_period(en_tokenizer, text, length):
+    tokens = en_tokenizer(text)
+    assert len(tokens) == length
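The en_tokenizer argument is a fixture provided by the test suite's shared conftest (an English tokenizer), not something added in this diff; pytest.mark.parametrize runs the new test once per (text, length) pair, so both the bare placeholder and the one with a trailing period are covered.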