mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-30 20:06:30 +03:00
eddeb36c96
<!--- Provide a general summary of your changes in the title. --> ## Description - [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files. - [x] Update flake8 config to exclude very large files (lemmatization tables etc.) - [x] Update code to be compatible with flake8 rules - [x] Fix various small bugs, inconsistencies and messy stuff in the language data - [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means) Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` and actually get meaningful results. At the moment, the code style and linting aren't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information. ### Types of change enhancement, code style ## Checklist <!--- Before you submit the PR, go over this checklist and make sure you can tick off all the boxes. [] -> [x] --> - [x] I have submitted the spaCy Contributor Agreement. - [x] I ran the tests, and all new and existing tests passed. - [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
293 lines
10 KiB
Python
293 lines
10 KiB
Python
# coding: utf8
|
|
from __future__ import unicode_literals
|
|
|
|
|
|
def explain(term):
    """Get a description for a given POS tag, dependency label or entity type.

    term (unicode): The term to explain.
    RETURNS (unicode): The explanation, or `None` if not found in the glossary.

    EXAMPLE:
        >>> spacy.explain(u'NORP')
        >>> doc = nlp(u'Hello world')
        >>> print([(w.text, w.tag_, spacy.explain(w.tag_)) for w in doc])
    """
    # A single `dict.get` lookup yields `None` for unknown terms, which is the
    # documented "not found" result (no membership test + second lookup).
    return GLOSSARY.get(term)
|
|
|
|
|
|
# Lookup table used by `explain`: maps a tag / label string (POS tag, chunk
# label, dependency label or entity type) to a short human-readable
# description. Keys are case-sensitive, so e.g. "CC" (Penn Treebank POS tag)
# and "cc" (dependency label) are distinct entries.
GLOSSARY = {
    # POS tags
    # Universal POS Tags
    # http://universaldependencies.org/u/pos/
    "ADJ": "adjective",
    "ADP": "adposition",
    "ADV": "adverb",
    "AUX": "auxiliary",
    "CONJ": "conjunction",
    "CCONJ": "coordinating conjunction",
    "DET": "determiner",
    "INTJ": "interjection",
    "NOUN": "noun",
    "NUM": "numeral",
    "PART": "particle",
    "PRON": "pronoun",
    "PROPN": "proper noun",
    "PUNCT": "punctuation",
    "SCONJ": "subordinating conjunction",
    "SYM": "symbol",
    "VERB": "verb",
    "X": "other",
    "EOL": "end of line",
    "SPACE": "space",
    # POS tags (English)
    # OntoNotes 5 / Penn Treebank
    # https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
    ".": "punctuation mark, sentence closer",
    ",": "punctuation mark, comma",
    "-LRB-": "left round bracket",
    "-RRB-": "right round bracket",
    "``": "opening quotation mark",
    '""': "closing quotation mark",
    "''": "closing quotation mark",
    ":": "punctuation mark, colon or ellipsis",
    "$": "symbol, currency",
    "#": "symbol, number sign",
    "AFX": "affix",
    "CC": "conjunction, coordinating",
    "CD": "cardinal number",
    "DT": "determiner",
    "EX": "existential there",
    "FW": "foreign word",
    "HYPH": "punctuation mark, hyphen",
    "IN": "conjunction, subordinating or preposition",
    "JJ": "adjective",
    "JJR": "adjective, comparative",
    "JJS": "adjective, superlative",
    "LS": "list item marker",
    "MD": "verb, modal auxiliary",
    "NIL": "missing tag",
    "NN": "noun, singular or mass",
    "NNP": "noun, proper singular",
    "NNPS": "noun, proper plural",
    "NNS": "noun, plural",
    "PDT": "predeterminer",
    "POS": "possessive ending",
    "PRP": "pronoun, personal",
    "PRP$": "pronoun, possessive",
    "RB": "adverb",
    "RBR": "adverb, comparative",
    "RBS": "adverb, superlative",
    "RP": "adverb, particle",
    "TO": "infinitival to",
    "UH": "interjection",
    "VB": "verb, base form",
    "VBD": "verb, past tense",
    "VBG": "verb, gerund or present participle",
    "VBN": "verb, past participle",
    "VBP": "verb, non-3rd person singular present",
    "VBZ": "verb, 3rd person singular present",
    "WDT": "wh-determiner",
    "WP": "wh-pronoun, personal",
    "WP$": "wh-pronoun, possessive",
    "WRB": "wh-adverb",
    "SP": "space",
    "ADD": "email",
    "NFP": "superfluous punctuation",
    "GW": "additional word in multi-word expression",
    "XX": "unknown",
    "BES": 'auxiliary "be"',
    "HVS": 'forms of "have"',
    # POS Tags (German)
    # TIGER Treebank
    # http://www.ims.uni-stuttgart.de/forschung/ressourcen/korpora/TIGERCorpus/annotation/tiger_introduction.pdf
    "$(": "other sentence-internal punctuation mark",
    "$,": "comma",
    "$.": "sentence-final punctuation mark",
    "ADJA": "adjective, attributive",
    "ADJD": "adjective, adverbial or predicative",
    "APPO": "postposition",
    "APPR": "preposition; circumposition left",
    "APPRART": "preposition with article",
    "APZR": "circumposition right",
    "ART": "definite or indefinite article",
    "CARD": "cardinal number",
    "FM": "foreign language material",
    "ITJ": "interjection",
    "KOKOM": "comparative conjunction",
    "KON": "coordinate conjunction",
    "KOUI": 'subordinate conjunction with "zu" and infinitive',
    "KOUS": "subordinate conjunction with sentence",
    "NE": "proper noun",
    "NNE": "proper noun",
    "PAV": "pronominal adverb",
    "PROAV": "pronominal adverb",
    "PDAT": "attributive demonstrative pronoun",
    "PDS": "substituting demonstrative pronoun",
    "PIAT": "attributive indefinite pronoun without determiner",
    "PIDAT": "attributive indefinite pronoun with determiner",
    "PIS": "substituting indefinite pronoun",
    "PPER": "non-reflexive personal pronoun",
    "PPOSAT": "attributive possessive pronoun",
    "PPOSS": "substituting possessive pronoun",
    "PRELAT": "attributive relative pronoun",
    "PRELS": "substituting relative pronoun",
    "PRF": "reflexive personal pronoun",
    "PTKA": "particle with adjective or adverb",
    "PTKANT": "answer particle",
    "PTKNEG": "negative particle",
    "PTKVZ": "separable verbal particle",
    "PTKZU": '"zu" before infinitive',
    "PWAT": "attributive interrogative pronoun",
    "PWAV": "adverbial interrogative or relative pronoun",
    "PWS": "substituting interrogative pronoun",
    "TRUNC": "word remnant",
    "VAFIN": "finite verb, auxiliary",
    "VAIMP": "imperative, auxiliary",
    "VAINF": "infinitive, auxiliary",
    "VAPP": "perfect participle, auxiliary",
    "VMFIN": "finite verb, modal",
    "VMINF": "infinitive, modal",
    "VMPP": "perfect participle, modal",
    "VVFIN": "finite verb, full",
    "VVIMP": "imperative, full",
    "VVINF": "infinitive, full",
    "VVIZU": 'infinitive with "zu", full',
    "VVPP": "perfect participle, full",
    "XY": "non-word containing non-letter",
    # Noun chunks
    "NP": "noun phrase",
    "PP": "prepositional phrase",
    "VP": "verb phrase",
    "ADVP": "adverb phrase",
    "ADJP": "adjective phrase",
    "SBAR": "subordinating conjunction",
    "PRT": "particle",
    "PNP": "prepositional noun phrase",
    # Dependency Labels (English)
    # ClearNLP / Universal Dependencies
    # https://github.com/clir/clearnlp-guidelines/blob/master/md/specifications/dependency_labels.md
    "acomp": "adjectival complement",
    "advcl": "adverbial clause modifier",
    "advmod": "adverbial modifier",
    "agent": "agent",
    "amod": "adjectival modifier",
    "appos": "appositional modifier",
    "attr": "attribute",
    "aux": "auxiliary",
    "auxpass": "auxiliary (passive)",
    "cc": "coordinating conjunction",
    "ccomp": "clausal complement",
    "complm": "complementizer",
    "conj": "conjunct",
    "cop": "copula",
    "csubj": "clausal subject",
    "csubjpass": "clausal subject (passive)",
    "dep": "unclassified dependent",
    "det": "determiner",
    "dobj": "direct object",
    "expl": "expletive",
    "hmod": "modifier in hyphenation",
    "hyph": "hyphen",
    "infmod": "infinitival modifier",
    "intj": "interjection",
    "iobj": "indirect object",
    "mark": "marker",
    "meta": "meta modifier",
    "neg": "negation modifier",
    "nmod": "modifier of nominal",
    "nn": "noun compound modifier",
    "npadvmod": "noun phrase as adverbial modifier",
    "nsubj": "nominal subject",
    "nsubjpass": "nominal subject (passive)",
    "num": "number modifier",
    "number": "number compound modifier",
    "oprd": "object predicate",
    "obj": "object",
    "obl": "oblique nominal",
    "parataxis": "parataxis",
    # Spelling fixed: the label denotes a participial modifier.
    "partmod": "participial modifier",
    "pcomp": "complement of preposition",
    "pobj": "object of preposition",
    "poss": "possession modifier",
    "possessive": "possessive modifier",
    "preconj": "pre-correlative conjunction",
    "prep": "prepositional modifier",
    "prt": "particle",
    "punct": "punctuation",
    "quantmod": "modifier of quantifier",
    "rcmod": "relative clause modifier",
    "root": "root",
    "xcomp": "open clausal complement",
    # Dependency labels (German)
    # TIGER Treebank
    # http://www.ims.uni-stuttgart.de/forschung/ressourcen/korpora/TIGERCorpus/annotation/tiger_introduction.pdf
    # currently missing: 'cc' (comparative complement) because of conflict
    # with English labels
    "ac": "adpositional case marker",
    "adc": "adjective component",
    "ag": "genitive attribute",
    "ams": "measure argument of adjective",
    "app": "apposition",
    "avc": "adverbial phrase component",
    "cd": "coordinating conjunction",
    "cj": "conjunct",
    "cm": "comparative conjunction",
    "cp": "complementizer",
    "cvc": "collocational verb construction",
    "da": "dative",
    "dh": "discourse-level head",
    "dm": "discourse marker",
    "ep": "expletive es",
    "hd": "head",
    "ju": "junctor",
    "mnr": "postnominal modifier",
    "mo": "modifier",
    "ng": "negation",
    "nk": "noun kernel element",
    "nmc": "numerical component",
    "oa": "accusative object",
    "oc": "clausal object",
    "og": "genitive object",
    "op": "prepositional object",
    "par": "parenthetical element",
    "pd": "predicate",
    "pg": "phrasal genitive",
    "ph": "placeholder",
    "pm": "morphological particle",
    "pnc": "proper noun component",
    "rc": "relative clause",
    "re": "repeated element",
    "rs": "reported speech",
    "sb": "subject",
    # Named Entity Recognition
    # OntoNotes 5
    # https://catalog.ldc.upenn.edu/docs/LDC2013T19/OntoNotes-Release-5.0.pdf
    "PERSON": "People, including fictional",
    "NORP": "Nationalities or religious or political groups",
    "FACILITY": "Buildings, airports, highways, bridges, etc.",
    "FAC": "Buildings, airports, highways, bridges, etc.",
    "ORG": "Companies, agencies, institutions, etc.",
    "GPE": "Countries, cities, states",
    "LOC": "Non-GPE locations, mountain ranges, bodies of water",
    "PRODUCT": "Objects, vehicles, foods, etc. (not services)",
    "EVENT": "Named hurricanes, battles, wars, sports events, etc.",
    "WORK_OF_ART": "Titles of books, songs, etc.",
    "LAW": "Named documents made into laws.",
    "LANGUAGE": "Any named language",
    "DATE": "Absolute or relative dates or periods",
    "TIME": "Times smaller than a day",
    "PERCENT": 'Percentage, including "%"',
    "MONEY": "Monetary values, including unit",
    "QUANTITY": "Measurements, as of weight or distance",
    "ORDINAL": '"first", "second", etc.',
    "CARDINAL": "Numerals that do not fall under another type",
    # Named Entity Recognition
    # Wikipedia
    # http://www.sciencedirect.com/science/article/pii/S0004370212000276
    # https://pdfs.semanticscholar.org/5744/578cc243d92287f47448870bb426c66cc941.pdf
    "PER": "Named person or family.",
    "MISC": "Miscellaneous entities, e.g. events, nationalities, products or works of art",
}
|