spaCy/spacy/lang/en/morph_rules.py
Matthew Honnibal 04395ffa49 Bring English tag_map in line with UD Treebank
I wrote a small script to read the UD English training data and check
that our tag map and morph rules were resulting in the best POS map.
This hadn't been done for some time, and there have been various changes
to the UD schema since it has been done. After these changes we should
see much better agreement between our POS assignments and the UD POS
tags.
2019-03-21 13:53:44 +01:00

490 lines
12 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# coding: utf8
from __future__ import unicode_literals
from ...symbols import LEMMA, PRON_LEMMA, AUX
_subordinating_conjunctions = [
"that",
"if",
"as",
"because",
"of",
"for",
"before",
"in",
"while",
"after",
"since",
"like",
"with",
"so",
"to",
"by",
"on",
"about",
"than",
"whether",
"although",
"from",
"though",
"until",
"unless",
"once",
"without",
"at",
"into",
"cause",
"over",
"upon",
"till",
"whereas",
"beyond",
"whilst",
"except",
"despite",
"wether",
"then",
"but",
"becuse",
"whie",
"below",
"against",
"it",
"w/out",
"toward",
"albeit",
"save",
"besides",
"becouse",
"coz",
"til",
"ask",
"i'd",
"out",
"near",
"seince",
"towards",
"tho",
"sice",
"will",
]
_relative_pronouns = ["this", "that", "those", "these"]
MORPH_RULES = {
"DT": {word: {"POS": "PRON"} for word in _relative_pronouns},
"IN": {word: {"POS": "SCONJ"} for word in _subordinating_conjunctions},
"NN": {
"something": {"POS": "PRON"},
"anyone": {"POS": "PRON"},
"anything": {"POS": "PRON"},
"nothing": {"POS": "PRON"},
"someone": {"POS": "PRON"},
"everything": {"POS": "PRON"},
"everyone": {"POS": "PRON"},
"everybody": {"POS": "PRON"},
"nobody": {"POS": "PRON"},
"somebody": {"POS": "PRON"},
"anybody": {"POS": "PRON"},
"any1": {"POS": "PRON"},
},
"PRP": {
"I": {
LEMMA: PRON_LEMMA,
"POS": "PRON",
"PronType": "Prs",
"Person": "One",
"Number": "Sing",
"Case": "Nom",
},
"me": {
LEMMA: PRON_LEMMA,
"POS": "PRON",
"PronType": "Prs",
"Person": "One",
"Number": "Sing",
"Case": "Acc",
},
"you": {LEMMA: PRON_LEMMA, "POS": "PRON", "PronType": "Prs", "Person": "Two"},
"he": {
LEMMA: PRON_LEMMA,
"POS": "PRON",
"PronType": "Prs",
"Person": "Three",
"Number": "Sing",
"Gender": "Masc",
"Case": "Nom",
},
"him": {
LEMMA: PRON_LEMMA,
"POS": "PRON",
"PronType": "Prs",
"Person": "Three",
"Number": "Sing",
"Gender": "Masc",
"Case": "Acc",
},
"she": {
LEMMA: PRON_LEMMA,
"POS": "PRON",
"PronType": "Prs",
"Person": "Three",
"Number": "Sing",
"Gender": "Fem",
"Case": "Nom",
},
"her": {
LEMMA: PRON_LEMMA,
"POS": "PRON",
"PronType": "Prs",
"Person": "Three",
"Number": "Sing",
"Gender": "Fem",
"Case": "Acc",
},
"it": {
LEMMA: PRON_LEMMA,
"POS": "PRON",
"PronType": "Prs",
"Person": "Three",
"Number": "Sing",
"Gender": "Neut",
},
"we": {
LEMMA: PRON_LEMMA,
"POS": "PRON",
"PronType": "Prs",
"Person": "One",
"Number": "Plur",
"Case": "Nom",
},
"us": {
LEMMA: PRON_LEMMA,
"POS": "PRON",
"PronType": "Prs",
"Person": "One",
"Number": "Plur",
"Case": "Acc",
},
"they": {
LEMMA: PRON_LEMMA,
"POS": "PRON",
"PronType": "Prs",
"Person": "Three",
"Number": "Plur",
"Case": "Nom",
},
"them": {
LEMMA: PRON_LEMMA,
"POS": "PRON",
"PronType": "Prs",
"Person": "Three",
"Number": "Plur",
"Case": "Acc",
},
"mine": {
LEMMA: PRON_LEMMA,
"POS": "PRON",
"PronType": "Prs",
"Person": "One",
"Number": "Sing",
"Poss": "Yes",
"Reflex": "Yes",
},
"his": {
LEMMA: PRON_LEMMA,
"POS": "PRON",
"PronType": "Prs",
"Person": "Three",
"Number": "Sing",
"Gender": "Masc",
"Poss": "Yes",
"Reflex": "Yes",
},
"hers": {
LEMMA: PRON_LEMMA,
"POS": "PRON",
"PronType": "Prs",
"Person": "Three",
"Number": "Sing",
"Gender": "Fem",
"Poss": "Yes",
"Reflex": "Yes",
},
"its": {
LEMMA: PRON_LEMMA,
"POS": "PRON",
"PronType": "Prs",
"Person": "Three",
"Number": "Sing",
"Gender": "Neut",
"Poss": "Yes",
"Reflex": "Yes",
},
"ours": {
LEMMA: PRON_LEMMA,
"POS": "PRON",
"PronType": "Prs",
"Person": "One",
"Number": "Plur",
"Poss": "Yes",
"Reflex": "Yes",
},
"yours": {
LEMMA: PRON_LEMMA,
"POS": "PRON",
"PronType": "Prs",
"Person": "Two",
"Number": "Plur",
"Poss": "Yes",
"Reflex": "Yes",
},
"theirs": {
LEMMA: PRON_LEMMA,
"POS": "PRON",
"PronType": "Prs",
"Person": "Three",
"Number": "Plur",
"Poss": "Yes",
"Reflex": "Yes",
},
"myself": {
LEMMA: PRON_LEMMA,
"POS": "PRON",
"PronType": "Prs",
"Person": "One",
"Number": "Sing",
"Case": "Acc",
"Reflex": "Yes",
},
"yourself": {
LEMMA: PRON_LEMMA,
"POS": "PRON",
"PronType": "Prs",
"Person": "Two",
"Case": "Acc",
"Reflex": "Yes",
},
"himself": {
LEMMA: PRON_LEMMA,
"POS": "PRON",
"PronType": "Prs",
"Person": "Three",
"Number": "Sing",
"Case": "Acc",
"Gender": "Masc",
"Reflex": "Yes",
},
"herself": {
LEMMA: PRON_LEMMA,
"POS": "PRON",
"PronType": "Prs",
"Person": "Three",
"Number": "Sing",
"Case": "Acc",
"Gender": "Fem",
"Reflex": "Yes",
},
"itself": {
LEMMA: PRON_LEMMA,
"POS": "PRON",
"PronType": "Prs",
"Person": "Three",
"Number": "Sing",
"Case": "Acc",
"Gender": "Neut",
"Reflex": "Yes",
},
"themself": {
LEMMA: PRON_LEMMA,
"POS": "PRON",
"PronType": "Prs",
"Person": "Three",
"Number": "Sing",
"Case": "Acc",
"Reflex": "Yes",
},
"ourselves": {
LEMMA: PRON_LEMMA,
"POS": "PRON",
"PronType": "Prs",
"Person": "One",
"Number": "Plur",
"Case": "Acc",
"Reflex": "Yes",
},
"yourselves": {
LEMMA: PRON_LEMMA,
"POS": "PRON",
"PronType": "Prs",
"Person": "Two",
"Case": "Acc",
"Reflex": "Yes",
},
"themselves": {
LEMMA: PRON_LEMMA,
"POS": "PRON",
"PronType": "Prs",
"Person": "Three",
"Number": "Plur",
"Case": "Acc",
"Reflex": "Yes",
},
},
"PRP$": {
"my": {
LEMMA: PRON_LEMMA,
"Person": "One",
"Number": "Sing",
"PronType": "Prs",
"Poss": "Yes",
},
"your": {LEMMA: PRON_LEMMA, "Person": "Two", "PronType": "Prs", "Poss": "Yes"},
"his": {
LEMMA: PRON_LEMMA,
"Person": "Three",
"Number": "Sing",
"Gender": "Masc",
"PronType": "Prs",
"Poss": "Yes",
},
"her": {
LEMMA: PRON_LEMMA,
"Person": "Three",
"Number": "Sing",
"Gender": "Fem",
"PronType": "Prs",
"Poss": "Yes",
},
"its": {
LEMMA: PRON_LEMMA,
"Person": "Three",
"Number": "Sing",
"Gender": "Neut",
"PronType": "Prs",
"Poss": "Yes",
},
"our": {
LEMMA: PRON_LEMMA,
"Person": "One",
"Number": "Plur",
"PronType": "Prs",
"Poss": "Yes",
},
"their": {
LEMMA: PRON_LEMMA,
"Person": "Three",
"Number": "Plur",
"PronType": "Prs",
"Poss": "Yes",
},
},
"RB": {word: {"POS": "PART"} for word in ["not", "n't", "nt", "nt"]},
"VB": {
word: {"POS": "AUX"}
for word in ["be", "have", "do", "get", "of", "am", "are", "'ve"]
},
"VBN": {"been": {LEMMA: "be", "POS": "AUX"}},
"VBG": {"being": {LEMMA: "be", "POS": "AUX"}},
"VBZ": {
"am": {
LEMMA: "be",
"POS": "AUX",
"VerbForm": "Fin",
"Person": "One",
"Tense": "Pres",
"Mood": "Ind",
},
"are": {
LEMMA: "be",
"POS": "AUX",
"VerbForm": "Fin",
"Person": "Two",
"Tense": "Pres",
"Mood": "Ind",
},
"is": {
LEMMA: "be",
"POS": "AUX",
"VerbForm": "Fin",
"Person": "Three",
"Tense": "Pres",
"Mood": "Ind",
},
"'re": {
LEMMA: "be",
"POS": "AUX",
"VerbForm": "Fin",
"Person": "Two",
"Tense": "Pres",
"Mood": "Ind",
},
"'s": {
LEMMA: "be",
"POS": "AUX",
"VerbForm": "Fin",
"Person": "Three",
"Tense": "Pres",
"Mood": "Ind",
},
"has": {"POS": "AUX"},
"does": {"POS": "AUX"},
},
"VBP": {
"are": {
LEMMA: "be",
"POS": "AUX",
"VerbForm": "Fin",
"Tense": "Pres",
"Mood": "Ind",
},
"'re": {
LEMMA: "be",
"POS": "AUX",
"VerbForm": "Fin",
"Tense": "Pres",
"Mood": "Ind",
},
"am": {
LEMMA: "be",
"POS": "AUX",
"VerbForm": "Fin",
"Person": "One",
"Tense": "Pres",
"Mood": "Ind",
},
"do": {"POS": "AUX"},
"have": {"POS": "AUX"},
"'m": {"POS": "AUX", LEMMA: "be"},
"'ve": {"POS": "AUX"},
"'re": {"POS": "AUX", LEMMA: "be"},
"'s": {"POS": "AUX"},
"is": {"POS": "AUX"},
"'d": {"POS": "AUX"},
},
"VBD": {
"was": {
LEMMA: "be",
"POS": "AUX",
"VerbForm": "Fin",
"Tense": "Past",
"Number": "Sing",
},
"were": {
LEMMA: "be",
"POS": "AUX",
"VerbForm": "Fin",
"Tense": "Past",
"Number": "Plur",
},
"did": {"POS": "AUX"},
"had": {"POS": "AUX"},
"'d": {"POS": "AUX"},
},
}
for tag, rules in MORPH_RULES.items():
for key, attrs in dict(rules).items():
rules[key.title()] = attrs