mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-24 17:06:29 +03:00
Remove English exceptions with mismatched features (#10873)
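Remove English contraction exceptions whose parts have mismatched number features (such as plural "these" with "'s", or singular "this" with "'re"), so that strings like "theses" and "thisre" are no longer generated as tokenizer exceptions and stay single tokens.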
This commit is contained in:
parent
41389ffe1e
commit
727ce6d1f5
|
@ -35,7 +35,7 @@ for pron in ["i"]:
|
||||||
|
|
||||||
_exc[orth + "m"] = [
|
_exc[orth + "m"] = [
|
||||||
{ORTH: orth, NORM: pron},
|
{ORTH: orth, NORM: pron},
|
||||||
{ORTH: "m", "tenspect": 1, "number": 1},
|
{ORTH: "m"},
|
||||||
]
|
]
|
||||||
|
|
||||||
_exc[orth + "'ma"] = [
|
_exc[orth + "'ma"] = [
|
||||||
|
@ -139,26 +139,27 @@ for pron in ["he", "she", "it"]:
|
||||||
|
|
||||||
# W-words, relative pronouns, prepositions etc.
|
# W-words, relative pronouns, prepositions etc.
|
||||||
|
|
||||||
for word in [
|
for word, morph in [
|
||||||
"who",
|
("who", None),
|
||||||
"what",
|
("what", None),
|
||||||
"when",
|
("when", None),
|
||||||
"where",
|
("where", None),
|
||||||
"why",
|
("why", None),
|
||||||
"how",
|
("how", None),
|
||||||
"there",
|
("there", None),
|
||||||
"that",
|
("that", "Number=Sing|Person=3"),
|
||||||
"this",
|
("this", "Number=Sing|Person=3"),
|
||||||
"these",
|
("these", "Number=Plur|Person=3"),
|
||||||
"those",
|
("those", "Number=Plur|Person=3"),
|
||||||
]:
|
]:
|
||||||
for orth in [word, word.title()]:
|
for orth in [word, word.title()]:
|
||||||
_exc[orth + "'s"] = [
|
if morph != "Number=Plur|Person=3":
|
||||||
{ORTH: orth, NORM: word},
|
_exc[orth + "'s"] = [
|
||||||
{ORTH: "'s", NORM: "'s"},
|
{ORTH: orth, NORM: word},
|
||||||
]
|
{ORTH: "'s", NORM: "'s"},
|
||||||
|
]
|
||||||
|
|
||||||
_exc[orth + "s"] = [{ORTH: orth, NORM: word}, {ORTH: "s"}]
|
_exc[orth + "s"] = [{ORTH: orth, NORM: word}, {ORTH: "s"}]
|
||||||
|
|
||||||
_exc[orth + "'ll"] = [
|
_exc[orth + "'ll"] = [
|
||||||
{ORTH: orth, NORM: word},
|
{ORTH: orth, NORM: word},
|
||||||
|
@ -182,25 +183,26 @@ for word in [
|
||||||
{ORTH: "ve", NORM: "have"},
|
{ORTH: "ve", NORM: "have"},
|
||||||
]
|
]
|
||||||
|
|
||||||
_exc[orth + "'re"] = [
|
if morph != "Number=Sing|Person=3":
|
||||||
{ORTH: orth, NORM: word},
|
_exc[orth + "'re"] = [
|
||||||
{ORTH: "'re", NORM: "are"},
|
{ORTH: orth, NORM: word},
|
||||||
]
|
{ORTH: "'re", NORM: "are"},
|
||||||
|
]
|
||||||
|
|
||||||
_exc[orth + "re"] = [
|
_exc[orth + "re"] = [
|
||||||
{ORTH: orth, NORM: word},
|
{ORTH: orth, NORM: word},
|
||||||
{ORTH: "re", NORM: "are"},
|
{ORTH: "re", NORM: "are"},
|
||||||
]
|
]
|
||||||
|
|
||||||
_exc[orth + "'ve"] = [
|
_exc[orth + "'ve"] = [
|
||||||
{ORTH: orth, NORM: word},
|
{ORTH: orth, NORM: word},
|
||||||
{ORTH: "'ve"},
|
{ORTH: "'ve"},
|
||||||
]
|
]
|
||||||
|
|
||||||
_exc[orth + "ve"] = [
|
_exc[orth + "ve"] = [
|
||||||
{ORTH: orth},
|
{ORTH: orth},
|
||||||
{ORTH: "ve", NORM: "have"},
|
{ORTH: "ve", NORM: "have"},
|
||||||
]
|
]
|
||||||
|
|
||||||
_exc[orth + "'d"] = [
|
_exc[orth + "'d"] = [
|
||||||
{ORTH: orth, NORM: word},
|
{ORTH: orth, NORM: word},
|
||||||
|
|
|
@ -167,3 +167,12 @@ def test_issue3521(en_tokenizer, word):
|
||||||
tok = en_tokenizer(word)[1]
|
tok = en_tokenizer(word)[1]
|
||||||
# 'not' and 'would' should be stopwords, also in their abbreviated forms
|
# 'not' and 'would' should be stopwords, also in their abbreviated forms
|
||||||
assert tok.is_stop
|
assert tok.is_stop
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.issue(10699)
|
||||||
|
@pytest.mark.parametrize("text", ["theses", "thisre"])
|
||||||
|
def test_issue10699(en_tokenizer, text):
|
||||||
|
"""Test that 'theses' and 'thisre' are excluded from the contractions
|
||||||
|
generated by the English tokenizer exceptions."""
|
||||||
|
tokens = en_tokenizer(text)
|
||||||
|
assert len(tokens) == 1
|
||||||
|
|
Loading…
Reference in New Issue
Block a user