mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-24 00:46:28 +03:00
Remove English exceptions with mismatched features (#10873)
Remove English contraction exceptions with mismatched number features, which caused non-contractions such as "theses" and "thisre" to be split into two tokens.
This commit is contained in:
parent
41389ffe1e
commit
727ce6d1f5
|
@ -35,7 +35,7 @@ for pron in ["i"]:
|
|||
|
||||
_exc[orth + "m"] = [
|
||||
{ORTH: orth, NORM: pron},
|
||||
{ORTH: "m", "tenspect": 1, "number": 1},
|
||||
{ORTH: "m"},
|
||||
]
|
||||
|
||||
_exc[orth + "'ma"] = [
|
||||
|
@ -139,26 +139,27 @@ for pron in ["he", "she", "it"]:
|
|||
|
||||
# W-words, relative pronouns, prepositions etc.
|
||||
|
||||
for word in [
|
||||
"who",
|
||||
"what",
|
||||
"when",
|
||||
"where",
|
||||
"why",
|
||||
"how",
|
||||
"there",
|
||||
"that",
|
||||
"this",
|
||||
"these",
|
||||
"those",
|
||||
for word, morph in [
|
||||
("who", None),
|
||||
("what", None),
|
||||
("when", None),
|
||||
("where", None),
|
||||
("why", None),
|
||||
("how", None),
|
||||
("there", None),
|
||||
("that", "Number=Sing|Person=3"),
|
||||
("this", "Number=Sing|Person=3"),
|
||||
("these", "Number=Plur|Person=3"),
|
||||
("those", "Number=Plur|Person=3"),
|
||||
]:
|
||||
for orth in [word, word.title()]:
|
||||
_exc[orth + "'s"] = [
|
||||
{ORTH: orth, NORM: word},
|
||||
{ORTH: "'s", NORM: "'s"},
|
||||
]
|
||||
if morph != "Number=Plur|Person=3":
|
||||
_exc[orth + "'s"] = [
|
||||
{ORTH: orth, NORM: word},
|
||||
{ORTH: "'s", NORM: "'s"},
|
||||
]
|
||||
|
||||
_exc[orth + "s"] = [{ORTH: orth, NORM: word}, {ORTH: "s"}]
|
||||
_exc[orth + "s"] = [{ORTH: orth, NORM: word}, {ORTH: "s"}]
|
||||
|
||||
_exc[orth + "'ll"] = [
|
||||
{ORTH: orth, NORM: word},
|
||||
|
@ -182,25 +183,26 @@ for word in [
|
|||
{ORTH: "ve", NORM: "have"},
|
||||
]
|
||||
|
||||
_exc[orth + "'re"] = [
|
||||
{ORTH: orth, NORM: word},
|
||||
{ORTH: "'re", NORM: "are"},
|
||||
]
|
||||
if morph != "Number=Sing|Person=3":
|
||||
_exc[orth + "'re"] = [
|
||||
{ORTH: orth, NORM: word},
|
||||
{ORTH: "'re", NORM: "are"},
|
||||
]
|
||||
|
||||
_exc[orth + "re"] = [
|
||||
{ORTH: orth, NORM: word},
|
||||
{ORTH: "re", NORM: "are"},
|
||||
]
|
||||
_exc[orth + "re"] = [
|
||||
{ORTH: orth, NORM: word},
|
||||
{ORTH: "re", NORM: "are"},
|
||||
]
|
||||
|
||||
_exc[orth + "'ve"] = [
|
||||
{ORTH: orth, NORM: word},
|
||||
{ORTH: "'ve"},
|
||||
]
|
||||
_exc[orth + "'ve"] = [
|
||||
{ORTH: orth, NORM: word},
|
||||
{ORTH: "'ve"},
|
||||
]
|
||||
|
||||
_exc[orth + "ve"] = [
|
||||
{ORTH: orth},
|
||||
{ORTH: "ve", NORM: "have"},
|
||||
]
|
||||
_exc[orth + "ve"] = [
|
||||
{ORTH: orth},
|
||||
{ORTH: "ve", NORM: "have"},
|
||||
]
|
||||
|
||||
_exc[orth + "'d"] = [
|
||||
{ORTH: orth, NORM: word},
|
||||
|
|
|
@ -167,3 +167,12 @@ def test_issue3521(en_tokenizer, word):
|
|||
tok = en_tokenizer(word)[1]
|
||||
# 'not' and 'would' should be stopwords, also in their abbreviated forms
|
||||
assert tok.is_stop
|
||||
|
||||
|
||||
@pytest.mark.issue(10699)
@pytest.mark.parametrize("text", ["theses", "thisre"])
def test_issue10699(en_tokenizer, text):
    """Ensure the English tokenizer keeps "theses" and "thisre" whole.

    Neither string is a contraction, so the tokenizer exceptions must not
    split a trailing "s" or "re" off into a separate token.
    """
    doc = en_tokenizer(text)
    # Exactly one token means no contraction exception matched the text.
    assert len(doc) == 1
|
||||
|
|
Loading…
Reference in New Issue
Block a user