Mirror of https://github.com/explosion/spaCy.git, synced 2025-07-13 01:32:32 +03:00
Merge tokenizer exceptions from PR #802
commit 5d706ab95d
parent 85f951ca99
@@ -13039,7 +13039,6 @@ Javerlhac-et-la-Chapelle-Saint-Robert
 Javron-les-Chapelles
 JAX-RPC
 JAX-RS
-J.-C.
 Jeannois-Mitissien
 jeans-de-gand
 jeans-de-janten
@@ -75,15 +75,49 @@ def get_tokenizer_exceptions():
         "déc.": [
             {LEMMA: "décembre", ORTH: "déc."}
         ],
+        "apr.": [
+            {LEMMA: "après", ORTH: "apr."}
+        ],
+        "J.-C.": [
+            {LEMMA: "Jésus", ORTH: "J."},
+            {LEMMA: "Christ", ORTH: "-C."}
+        ],
+        "Dr.": [
+            {LEMMA: "docteur", ORTH: "Dr."}
+        ],
+        "M.": [
+            {LEMMA: "monsieur", ORTH: "M."}
+        ],
+        "Mr.": [
+            {LEMMA: "monsieur", ORTH: "Mr."}
+        ],
+        "Mme.": [
+            {LEMMA: "madame", ORTH: "Mme."}
+        ],
+        "Mlle.": [
+            {LEMMA: "mademoiselle", ORTH: "Mlle."}
+        ],
+        "n°": [
+            {LEMMA: "numéro", ORTH: "n°"}
+        ],
+        "d°": [
+            {LEMMA: "degrés", ORTH: "d°"}
+        ],
+        "St.": [
+            {LEMMA: "saint", ORTH: "St."}
+        ],
+        "Ste.": [
+            {LEMMA: "sainte", ORTH: "Ste."}
+        ]
     }
 
     ABBREVIATIONS_2 = [
-        "Dr.",
         "etc.",
     ]
 
     VERBS = {}
-    for verb, verb_lemma in (("a", "avoir"), ("est", "être"), ("semble", "sembler"), ("indique", "indiquer"),
+    for verb, verb_lemma in (("a", "avoir"), ("est", "être"),
+                             ("semble", "sembler"), ("indique", "indiquer"),
                              ("moque", "moquer"), ("passe", "passer")):
         for pronoun in ("elle", "il", "on"):
             token = "{}-t-{}".format(verb, pronoun)
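
Each special case above maps a surface string to the list of subtokens it should split into: ORTH holds the slice of the original text and LEMMA its base form, and the ORTH pieces must concatenate back to the key exactly ("J." + "-C." == "J.-C."), which is also why "J.-C." can leave the flat word list in the first hunk once it is handled here. The loop at the end generates the same structure for t-euphonic inversions such as "a-t-il". Below is a minimal standalone sketch of both patterns, using plain string keys in place of the ORTH/LEMMA symbol constants; the three-way subtoken split for the verb forms is an assumption, since the hunk ends before the loop body that fills VERBS.

    # Standalone sketch, not the spaCy module itself: "ORTH"/"LEMMA" string
    # keys stand in for the symbol constants imported in the real file.

    EXC = {
        "J.-C.": [
            {"LEMMA": "Jésus", "ORTH": "J."},
            {"LEMMA": "Christ", "ORTH": "-C."},
        ],
        "Mme.": [
            {"LEMMA": "madame", "ORTH": "Mme."},
        ],
    }

    # T-euphonic inversions, mirroring the loop above. The "-t" subtoken and
    # its lemma are assumptions: the hunk cuts off after `token = ...`.
    VERBS = {}
    for verb, verb_lemma in (("a", "avoir"), ("est", "être"),
                             ("semble", "sembler"), ("indique", "indiquer"),
                             ("moque", "moquer"), ("passe", "passer")):
        for pronoun in ("elle", "il", "on"):
            token = "{}-t-{}".format(verb, pronoun)
            VERBS[token] = [
                {"LEMMA": verb_lemma, "ORTH": verb},
                {"LEMMA": "t", "ORTH": "-t"},
                {"LEMMA": pronoun, "ORTH": "-" + pronoun},
            ]

    def check(exceptions):
        # A special case is only valid if its ORTH pieces rebuild the key.
        for string, subtokens in exceptions.items():
            assert "".join(t["ORTH"] for t in subtokens) == string, string

    check(EXC)
    check(VERBS)
    print(VERBS["a-t-il"])
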
@@ -98,7 +132,8 @@ def get_tokenizer_exceptions():
             {LEMMA: 'ce', ORTH: '-ce'}
         ]
 
-    for pre, pre_lemma in (("qu'", "que"), ("Qu'", "Que"), ("n'", "ne"), ("N'", "Ne")):
+    for pre, pre_lemma in (("qu'", "que"), ("Qu'", "Que"), ("n'", "ne"),
+                           ("N'", "Ne")):
         VERBS['{}est-ce'.format(pre)] = [
             {LEMMA: pre_lemma, ORTH: pre},
             {LEMMA: 'être', ORTH: "est"},
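
The same generate-rather-than-enumerate approach covers the inverted "est-ce" questions. The hunk stops mid-entry, but the '-ce' context line at its top suggests each entry closes with a {LEMMA: 'ce', ORTH: '-ce'} subtoken; this sketch completes the loop under that assumption.

    # Sketch only: the closing "-ce" subtoken is inferred from the context
    # line at the top of the hunk, not shown in the added lines themselves.
    VERBS = {}
    for pre, pre_lemma in (("qu'", "que"), ("Qu'", "Que"), ("n'", "ne"),
                           ("N'", "Ne")):
        VERBS["{}est-ce".format(pre)] = [
            {"LEMMA": pre_lemma, "ORTH": pre},
            {"LEMMA": "être", "ORTH": "est"},
            {"LEMMA": "ce", "ORTH": "-ce"},
        ]

    # "qu'est-ce" splits into "qu'", "est", "-ce":
    print(VERBS["qu'est-ce"])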