Remove English exceptions with mismatched features (#10873)

Remove English contraction exceptions with mismatched features that lead
to exceptions like "theses" and "thisre".
This commit is contained in:
Adriane Boyd 2022-06-03 09:44:04 +02:00 committed by GitHub
parent 41389ffe1e
commit 727ce6d1f5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 45 additions and 34 deletions

View File

@ -35,7 +35,7 @@ for pron in ["i"]:
_exc[orth + "m"] = [
{ORTH: orth, NORM: pron},
{ORTH: "m", "tenspect": 1, "number": 1},
{ORTH: "m"},
]
_exc[orth + "'ma"] = [
@ -139,20 +139,21 @@ for pron in ["he", "she", "it"]:
# W-words, relative pronouns, prepositions etc.
for word in [
"who",
"what",
"when",
"where",
"why",
"how",
"there",
"that",
"this",
"these",
"those",
for word, morph in [
("who", None),
("what", None),
("when", None),
("where", None),
("why", None),
("how", None),
("there", None),
("that", "Number=Sing|Person=3"),
("this", "Number=Sing|Person=3"),
("these", "Number=Plur|Person=3"),
("those", "Number=Plur|Person=3"),
]:
for orth in [word, word.title()]:
if morph != "Number=Plur|Person=3":
_exc[orth + "'s"] = [
{ORTH: orth, NORM: word},
{ORTH: "'s", NORM: "'s"},
@ -182,6 +183,7 @@ for word in [
{ORTH: "ve", NORM: "have"},
]
if morph != "Number=Sing|Person=3":
_exc[orth + "'re"] = [
{ORTH: orth, NORM: word},
{ORTH: "'re", NORM: "are"},

View File

@ -167,3 +167,12 @@ def test_issue3521(en_tokenizer, word):
tok = en_tokenizer(word)[1]
# 'not' and 'would' should be stopwords, also in their abbreviated forms
assert tok.is_stop
@pytest.mark.issue(10699)
@pytest.mark.parametrize("text", ["theses", "thisre"])
def test_issue10699(en_tokenizer, text):
    """Regression test: "theses" and "thisre" must not be among the
    contractions generated by the English tokenizer exceptions."""
    # If either string matched a generated contraction exception, the
    # tokenizer would return more than one token for it.
    assert len(en_tokenizer(text)) == 1