mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-11 04:08:09 +03:00
04395ffa49
I wrote a small script to read the UD English training data and check that our tag map and morph rules produce the best POS mapping. This hadn't been done for some time, and there have been various changes to the UD schema since it was last done. After these changes, we should see much better agreement between our POS assignments and the UD POS tags.
490 lines
12 KiB
Python
490 lines
12 KiB
Python
# coding: utf8
|
||
from __future__ import unicode_literals
|
||
|
||
from ...symbols import LEMMA, PRON_LEMMA, AUX
|
||
|
||
# Word forms that MORPH_RULES maps to POS "SCONJ" when they carry the "IN"
# tag. The inventory deliberately keeps informal spellings and misspellings
# (e.g. "becuse", "whie", "seince") exactly as listed; order is preserved.
_subordinating_conjunctions = """
    that if as because of for before in while after since like with so to
    by on about than whether although from though until unless once without
    at into cause over upon till whereas beyond whilst except despite wether
    then but becuse whie below against it w/out toward albeit save besides
    becouse coz til ask i'd out near seince towards tho sice will
""".split()
|
||
|
||
# Word forms that MORPH_RULES maps to POS "PRON" when they carry the "DT" tag.
_relative_pronouns = [
    "this",
    "that",
    "those",
    "these",
]
|
||
|
||
# Per-tag, per-word attribute overrides.
#
# Layout: {fine-grained tag: {word form: {attribute: value}}}. Attribute keys
# are either the LEMMA symbol or feature-name strings such as "POS",
# "PronType", "Person", "Number", "Case", "Gender", "Poss", "Reflex",
# "VerbForm", "Tense" and "Mood".
# (Presumably consumed by the tagger/morphology code once a tag has been
# assigned -- the consumer is not visible in this module.)
#
# Word forms are written in lower case here (except "I"); title-cased
# variants are added by the loop that follows this literal.
MORPH_RULES = {
    # Determiner-tagged demonstratives are treated as pronouns.
    "DT": {word: {"POS": "PRON"} for word in _relative_pronouns},
    # Preposition-tagged forms that should surface as subordinating conjunctions.
    "IN": {word: {"POS": "SCONJ"} for word in _subordinating_conjunctions},
    # Indefinite pronouns that carry the common-noun tag.
    "NN": {
        "something": {"POS": "PRON"},
        "anyone": {"POS": "PRON"},
        "anything": {"POS": "PRON"},
        "nothing": {"POS": "PRON"},
        "someone": {"POS": "PRON"},
        "everything": {"POS": "PRON"},
        "everyone": {"POS": "PRON"},
        "everybody": {"POS": "PRON"},
        "nobody": {"POS": "PRON"},
        "somebody": {"POS": "PRON"},
        "anybody": {"POS": "PRON"},
        "any1": {"POS": "PRON"},
    },
    # Personal, possessive (independent) and reflexive pronouns.
    "PRP": {
        "I": {
            LEMMA: PRON_LEMMA,
            "POS": "PRON",
            "PronType": "Prs",
            "Person": "One",
            "Number": "Sing",
            "Case": "Nom",
        },
        "me": {
            LEMMA: PRON_LEMMA,
            "POS": "PRON",
            "PronType": "Prs",
            "Person": "One",
            "Number": "Sing",
            "Case": "Acc",
        },
        "you": {LEMMA: PRON_LEMMA, "POS": "PRON", "PronType": "Prs", "Person": "Two"},
        "he": {
            LEMMA: PRON_LEMMA,
            "POS": "PRON",
            "PronType": "Prs",
            "Person": "Three",
            "Number": "Sing",
            "Gender": "Masc",
            "Case": "Nom",
        },
        "him": {
            LEMMA: PRON_LEMMA,
            "POS": "PRON",
            "PronType": "Prs",
            "Person": "Three",
            "Number": "Sing",
            "Gender": "Masc",
            "Case": "Acc",
        },
        "she": {
            LEMMA: PRON_LEMMA,
            "POS": "PRON",
            "PronType": "Prs",
            "Person": "Three",
            "Number": "Sing",
            "Gender": "Fem",
            "Case": "Nom",
        },
        "her": {
            LEMMA: PRON_LEMMA,
            "POS": "PRON",
            "PronType": "Prs",
            "Person": "Three",
            "Number": "Sing",
            "Gender": "Fem",
            "Case": "Acc",
        },
        "it": {
            LEMMA: PRON_LEMMA,
            "POS": "PRON",
            "PronType": "Prs",
            "Person": "Three",
            "Number": "Sing",
            "Gender": "Neut",
        },
        "we": {
            LEMMA: PRON_LEMMA,
            "POS": "PRON",
            "PronType": "Prs",
            "Person": "One",
            "Number": "Plur",
            "Case": "Nom",
        },
        "us": {
            LEMMA: PRON_LEMMA,
            "POS": "PRON",
            "PronType": "Prs",
            "Person": "One",
            "Number": "Plur",
            "Case": "Acc",
        },
        "they": {
            LEMMA: PRON_LEMMA,
            "POS": "PRON",
            "PronType": "Prs",
            "Person": "Three",
            "Number": "Plur",
            "Case": "Nom",
        },
        "them": {
            LEMMA: PRON_LEMMA,
            "POS": "PRON",
            "PronType": "Prs",
            "Person": "Three",
            "Number": "Plur",
            "Case": "Acc",
        },
        # NOTE(review): the possessive forms "mine" .. "theirs" below all carry
        # "Reflex": "Yes", the same flag used on the "-self"/"-selves" entries.
        # That looks copied rather than intended -- confirm against the UD
        # feature inventory before changing; values are left untouched here.
        "mine": {
            LEMMA: PRON_LEMMA,
            "POS": "PRON",
            "PronType": "Prs",
            "Person": "One",
            "Number": "Sing",
            "Poss": "Yes",
            "Reflex": "Yes",
        },
        "his": {
            LEMMA: PRON_LEMMA,
            "POS": "PRON",
            "PronType": "Prs",
            "Person": "Three",
            "Number": "Sing",
            "Gender": "Masc",
            "Poss": "Yes",
            "Reflex": "Yes",
        },
        "hers": {
            LEMMA: PRON_LEMMA,
            "POS": "PRON",
            "PronType": "Prs",
            "Person": "Three",
            "Number": "Sing",
            "Gender": "Fem",
            "Poss": "Yes",
            "Reflex": "Yes",
        },
        "its": {
            LEMMA: PRON_LEMMA,
            "POS": "PRON",
            "PronType": "Prs",
            "Person": "Three",
            "Number": "Sing",
            "Gender": "Neut",
            "Poss": "Yes",
            "Reflex": "Yes",
        },
        "ours": {
            LEMMA: PRON_LEMMA,
            "POS": "PRON",
            "PronType": "Prs",
            "Person": "One",
            "Number": "Plur",
            "Poss": "Yes",
            "Reflex": "Yes",
        },
        "yours": {
            LEMMA: PRON_LEMMA,
            "POS": "PRON",
            "PronType": "Prs",
            "Person": "Two",
            "Number": "Plur",
            "Poss": "Yes",
            "Reflex": "Yes",
        },
        "theirs": {
            LEMMA: PRON_LEMMA,
            "POS": "PRON",
            "PronType": "Prs",
            "Person": "Three",
            "Number": "Plur",
            "Poss": "Yes",
            "Reflex": "Yes",
        },
        "myself": {
            LEMMA: PRON_LEMMA,
            "POS": "PRON",
            "PronType": "Prs",
            "Person": "One",
            "Number": "Sing",
            "Case": "Acc",
            "Reflex": "Yes",
        },
        "yourself": {
            LEMMA: PRON_LEMMA,
            "POS": "PRON",
            "PronType": "Prs",
            "Person": "Two",
            "Case": "Acc",
            "Reflex": "Yes",
        },
        "himself": {
            LEMMA: PRON_LEMMA,
            "POS": "PRON",
            "PronType": "Prs",
            "Person": "Three",
            "Number": "Sing",
            "Case": "Acc",
            "Gender": "Masc",
            "Reflex": "Yes",
        },
        "herself": {
            LEMMA: PRON_LEMMA,
            "POS": "PRON",
            "PronType": "Prs",
            "Person": "Three",
            "Number": "Sing",
            "Case": "Acc",
            "Gender": "Fem",
            "Reflex": "Yes",
        },
        "itself": {
            LEMMA: PRON_LEMMA,
            "POS": "PRON",
            "PronType": "Prs",
            "Person": "Three",
            "Number": "Sing",
            "Case": "Acc",
            "Gender": "Neut",
            "Reflex": "Yes",
        },
        "themself": {
            LEMMA: PRON_LEMMA,
            "POS": "PRON",
            "PronType": "Prs",
            "Person": "Three",
            "Number": "Sing",
            "Case": "Acc",
            "Reflex": "Yes",
        },
        "ourselves": {
            LEMMA: PRON_LEMMA,
            "POS": "PRON",
            "PronType": "Prs",
            "Person": "One",
            "Number": "Plur",
            "Case": "Acc",
            "Reflex": "Yes",
        },
        "yourselves": {
            LEMMA: PRON_LEMMA,
            "POS": "PRON",
            "PronType": "Prs",
            "Person": "Two",
            "Case": "Acc",
            "Reflex": "Yes",
        },
        "themselves": {
            LEMMA: PRON_LEMMA,
            "POS": "PRON",
            "PronType": "Prs",
            "Person": "Three",
            "Number": "Plur",
            "Case": "Acc",
            "Reflex": "Yes",
        },
    },
    # Dependent possessives. No "POS" override is given for these entries.
    "PRP$": {
        "my": {
            LEMMA: PRON_LEMMA,
            "Person": "One",
            "Number": "Sing",
            "PronType": "Prs",
            "Poss": "Yes",
        },
        "your": {LEMMA: PRON_LEMMA, "Person": "Two", "PronType": "Prs", "Poss": "Yes"},
        "his": {
            LEMMA: PRON_LEMMA,
            "Person": "Three",
            "Number": "Sing",
            "Gender": "Masc",
            "PronType": "Prs",
            "Poss": "Yes",
        },
        "her": {
            LEMMA: PRON_LEMMA,
            "Person": "Three",
            "Number": "Sing",
            "Gender": "Fem",
            "PronType": "Prs",
            "Poss": "Yes",
        },
        "its": {
            LEMMA: PRON_LEMMA,
            "Person": "Three",
            "Number": "Sing",
            "Gender": "Neut",
            "PronType": "Prs",
            "Poss": "Yes",
        },
        "our": {
            LEMMA: PRON_LEMMA,
            "Person": "One",
            "Number": "Plur",
            "PronType": "Prs",
            "Poss": "Yes",
        },
        "their": {
            LEMMA: PRON_LEMMA,
            "Person": "Three",
            "Number": "Plur",
            "PronType": "Prs",
            "Poss": "Yes",
        },
    },
    # Negation forms tagged as adverbs become particles. The last spelling
    # uses a typographic apostrophe (U+2019), not an ASCII one.
    "RB": {word: {"POS": "PART"} for word in ["not", "n't", "nt", "n’t"]},
    # Base-form verbs that should surface as auxiliaries.
    "VB": {
        word: {"POS": "AUX"}
        for word in ["be", "have", "do", "get", "of", "am", "are", "'ve"]
    },
    "VBN": {"been": {LEMMA: "be", "POS": "AUX"}},
    "VBG": {"being": {LEMMA: "be", "POS": "AUX"}},
    "VBZ": {
        "am": {
            LEMMA: "be",
            "POS": "AUX",
            "VerbForm": "Fin",
            "Person": "One",
            "Tense": "Pres",
            "Mood": "Ind",
        },
        "are": {
            LEMMA: "be",
            "POS": "AUX",
            "VerbForm": "Fin",
            "Person": "Two",
            "Tense": "Pres",
            "Mood": "Ind",
        },
        "is": {
            LEMMA: "be",
            "POS": "AUX",
            "VerbForm": "Fin",
            "Person": "Three",
            "Tense": "Pres",
            "Mood": "Ind",
        },
        "'re": {
            LEMMA: "be",
            "POS": "AUX",
            "VerbForm": "Fin",
            "Person": "Two",
            "Tense": "Pres",
            "Mood": "Ind",
        },
        "'s": {
            LEMMA: "be",
            "POS": "AUX",
            "VerbForm": "Fin",
            "Person": "Three",
            "Tense": "Pres",
            "Mood": "Ind",
        },
        "has": {"POS": "AUX"},
        "does": {"POS": "AUX"},
    },
    "VBP": {
        "are": {
            LEMMA: "be",
            "POS": "AUX",
            "VerbForm": "Fin",
            "Tense": "Pres",
            "Mood": "Ind",
        },
        # This is the only "'re" entry for VBP. A second, shorter
        # {"POS": "AUX", LEMMA: "be"} entry used to follow "'ve"; as a
        # duplicate key in the same dict display it silently replaced this
        # one and dropped VerbForm/Tense/Mood. It was a strict subset of the
        # values here, so it has been removed rather than merged.
        "'re": {
            LEMMA: "be",
            "POS": "AUX",
            "VerbForm": "Fin",
            "Tense": "Pres",
            "Mood": "Ind",
        },
        "am": {
            LEMMA: "be",
            "POS": "AUX",
            "VerbForm": "Fin",
            "Person": "One",
            "Tense": "Pres",
            "Mood": "Ind",
        },
        "do": {"POS": "AUX"},
        "have": {"POS": "AUX"},
        "'m": {"POS": "AUX", LEMMA: "be"},
        "'ve": {"POS": "AUX"},
        "'s": {"POS": "AUX"},
        "is": {"POS": "AUX"},
        "'d": {"POS": "AUX"},
    },
    "VBD": {
        "was": {
            LEMMA: "be",
            "POS": "AUX",
            "VerbForm": "Fin",
            "Tense": "Past",
            "Number": "Sing",
        },
        "were": {
            LEMMA: "be",
            "POS": "AUX",
            "VerbForm": "Fin",
            "Tense": "Past",
            "Number": "Plur",
        },
        "did": {"POS": "AUX"},
        "had": {"POS": "AUX"},
        "'d": {"POS": "AUX"},
    },
}
|
||
|
||
|
||
# Register a title-cased alias for every word form so capitalised tokens hit
# the same rule. The alias shares the very same attribute dict object as the
# original key. Note that str.title() upper-cases the letter after any
# non-letter, so "n't" is aliased as "N'T" and "w/out" as "W/Out".
for rules in MORPH_RULES.values():
    # The comprehension is fully evaluated before update() mutates `rules`,
    # so the dict is never modified while it is being iterated.
    rules.update({form.title(): attrs for form, attrs in rules.items()})
|