Remove English exceptions with mismatched features (#10873)

Remove English contraction exceptions with mismatched features that lead
to exceptions like "theses" and "thisre".
This commit is contained in:
Adriane Boyd 2022-06-03 09:44:04 +02:00 committed by GitHub
parent 41389ffe1e
commit 727ce6d1f5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 45 additions and 34 deletions

View File

@@ -35,7 +35,7 @@ for pron in ["i"]:
_exc[orth + "m"] = [ _exc[orth + "m"] = [
{ORTH: orth, NORM: pron}, {ORTH: orth, NORM: pron},
{ORTH: "m", "tenspect": 1, "number": 1}, {ORTH: "m"},
] ]
_exc[orth + "'ma"] = [ _exc[orth + "'ma"] = [
@@ -139,26 +139,27 @@ for pron in ["he", "she", "it"]:
# W-words, relative pronouns, prepositions etc. # W-words, relative pronouns, prepositions etc.
for word in [ for word, morph in [
"who", ("who", None),
"what", ("what", None),
"when", ("when", None),
"where", ("where", None),
"why", ("why", None),
"how", ("how", None),
"there", ("there", None),
"that", ("that", "Number=Sing|Person=3"),
"this", ("this", "Number=Sing|Person=3"),
"these", ("these", "Number=Plur|Person=3"),
"those", ("those", "Number=Plur|Person=3"),
]: ]:
for orth in [word, word.title()]: for orth in [word, word.title()]:
_exc[orth + "'s"] = [ if morph != "Number=Plur|Person=3":
{ORTH: orth, NORM: word}, _exc[orth + "'s"] = [
{ORTH: "'s", NORM: "'s"}, {ORTH: orth, NORM: word},
] {ORTH: "'s", NORM: "'s"},
]
_exc[orth + "s"] = [{ORTH: orth, NORM: word}, {ORTH: "s"}] _exc[orth + "s"] = [{ORTH: orth, NORM: word}, {ORTH: "s"}]
_exc[orth + "'ll"] = [ _exc[orth + "'ll"] = [
{ORTH: orth, NORM: word}, {ORTH: orth, NORM: word},
@@ -182,25 +183,26 @@ for word in [
{ORTH: "ve", NORM: "have"}, {ORTH: "ve", NORM: "have"},
] ]
_exc[orth + "'re"] = [ if morph != "Number=Sing|Person=3":
{ORTH: orth, NORM: word}, _exc[orth + "'re"] = [
{ORTH: "'re", NORM: "are"}, {ORTH: orth, NORM: word},
] {ORTH: "'re", NORM: "are"},
]
_exc[orth + "re"] = [ _exc[orth + "re"] = [
{ORTH: orth, NORM: word}, {ORTH: orth, NORM: word},
{ORTH: "re", NORM: "are"}, {ORTH: "re", NORM: "are"},
] ]
_exc[orth + "'ve"] = [ _exc[orth + "'ve"] = [
{ORTH: orth, NORM: word}, {ORTH: orth, NORM: word},
{ORTH: "'ve"}, {ORTH: "'ve"},
] ]
_exc[orth + "ve"] = [ _exc[orth + "ve"] = [
{ORTH: orth}, {ORTH: orth},
{ORTH: "ve", NORM: "have"}, {ORTH: "ve", NORM: "have"},
] ]
_exc[orth + "'d"] = [ _exc[orth + "'d"] = [
{ORTH: orth, NORM: word}, {ORTH: orth, NORM: word},

View File

@@ -167,3 +167,12 @@ def test_issue3521(en_tokenizer, word):
tok = en_tokenizer(word)[1] tok = en_tokenizer(word)[1]
# 'not' and 'would' should be stopwords, also in their abbreviated forms # 'not' and 'would' should be stopwords, also in their abbreviated forms
assert tok.is_stop assert tok.is_stop
@pytest.mark.issue(10699)
@pytest.mark.parametrize("text", ["theses", "thisre"])
def test_issue10699(en_tokenizer, text):
    """Mismatched-feature contractions such as 'theses' and 'thisre'
    must not be produced by the English tokenizer exceptions, so each
    input should remain a single token."""
    doc = en_tokenizer(text)
    assert len(doc) == 1