mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-24 17:06:29 +03:00
Update tag maps and docs for English and German (#4501)
* Update English tag_map
Update English tag_map based on this conversion table:
https://universaldependencies.org/tagset-conversion/en-penn-uposf.html
* Update German tag_map
Update German tag_map based on this conversion table:
https://universaldependencies.org/tagset-conversion/de-stts-uposf.html
* Add missing Tiger dependencies to glossary
* Add quotes to definition of TO
* Update POS/TAG tables in docs
Update POS/TAG tables for English and German docs using current
information generated from the tag_maps and GLOSSARY.
* Update warning that -PRON- is specific to English
* Revert docs to default JSON output with convert
* Revert "Revert docs to default JSON output with convert"
This reverts commit 6b78c048f1.
This commit is contained in:
parent
10d88b09bb
commit
1b0bbe4b76
|
@ -80,7 +80,7 @@ GLOSSARY = {
|
|||
"RBR": "adverb, comparative",
|
||||
"RBS": "adverb, superlative",
|
||||
"RP": "adverb, particle",
|
||||
"TO": "infinitival to",
|
||||
"TO": 'infinitival "to"',
|
||||
"UH": "interjection",
|
||||
"VB": "verb, base form",
|
||||
"VBD": "verb, past tense",
|
||||
|
@ -279,6 +279,12 @@ GLOSSARY = {
|
|||
"re": "repeated element",
|
||||
"rs": "reported speech",
|
||||
"sb": "subject",
|
||||
"sb": "subject",
|
||||
"sbp": "passivized subject (PP)",
|
||||
"sp": "subject or predicate",
|
||||
"svp": "separable verb prefix",
|
||||
"uc": "unit component",
|
||||
"vo": "vocative",
|
||||
# Named Entity Recognition
|
||||
# OntoNotes 5
|
||||
# https://catalog.ldc.upenn.edu/docs/LDC2013T19/OntoNotes-Release-5.0.pdf
|
||||
|
|
|
@ -1,8 +1,8 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ...symbols import POS, PUNCT, ADJ, CONJ, SCONJ, NUM, DET, ADV, ADP, X, VERB
|
||||
from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON, AUX
|
||||
from ...symbols import POS, PUNCT, ADJ, CCONJ, SCONJ, NUM, DET, ADV, ADP, X
|
||||
from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON, AUX, VERB
|
||||
|
||||
|
||||
TAG_MAP = {
|
||||
|
@ -20,8 +20,8 @@ TAG_MAP = {
|
|||
"CARD": {POS: NUM, "NumType": "card"},
|
||||
"FM": {POS: X, "Foreign": "yes"},
|
||||
"ITJ": {POS: INTJ},
|
||||
"KOKOM": {POS: CONJ, "ConjType": "comp"},
|
||||
"KON": {POS: CONJ},
|
||||
"KOKOM": {POS: CCONJ, "ConjType": "comp"},
|
||||
"KON": {POS: CCONJ},
|
||||
"KOUI": {POS: SCONJ},
|
||||
"KOUS": {POS: SCONJ},
|
||||
"NE": {POS: PROPN},
|
||||
|
@ -43,7 +43,7 @@ TAG_MAP = {
|
|||
"PTKA": {POS: PART},
|
||||
"PTKANT": {POS: PART, "PartType": "res"},
|
||||
"PTKNEG": {POS: PART, "Polarity": "neg"},
|
||||
"PTKVZ": {POS: PART, "PartType": "vbp"},
|
||||
"PTKVZ": {POS: ADP, "PartType": "vbp"},
|
||||
"PTKZU": {POS: PART, "PartType": "inf"},
|
||||
"PWAT": {POS: DET, "PronType": "int"},
|
||||
"PWAV": {POS: ADV, "PronType": "int"},
|
||||
|
|
|
@ -28,8 +28,8 @@ TAG_MAP = {
|
|||
"JJR": {POS: ADJ, "Degree": "comp"},
|
||||
"JJS": {POS: ADJ, "Degree": "sup"},
|
||||
"LS": {POS: X, "NumType": "ord"},
|
||||
"MD": {POS: AUX, "VerbType": "mod"},
|
||||
"NIL": {POS: ""},
|
||||
"MD": {POS: VERB, "VerbType": "mod"},
|
||||
"NIL": {POS: X},
|
||||
"NN": {POS: NOUN, "Number": "sing"},
|
||||
"NNP": {POS: PROPN, "NounType": "prop", "Number": "sing"},
|
||||
"NNPS": {POS: PROPN, "NounType": "prop", "Number": "plur"},
|
||||
|
@ -37,7 +37,7 @@ TAG_MAP = {
|
|||
"PDT": {POS: DET},
|
||||
"POS": {POS: PART, "Poss": "yes"},
|
||||
"PRP": {POS: PRON, "PronType": "prs"},
|
||||
"PRP$": {POS: PRON, "PronType": "prs", "Poss": "yes"},
|
||||
"PRP$": {POS: DET, "PronType": "prs", "Poss": "yes"},
|
||||
"RB": {POS: ADV, "Degree": "pos"},
|
||||
"RBR": {POS: ADV, "Degree": "comp"},
|
||||
"RBS": {POS: ADV, "Degree": "sup"},
|
||||
|
@ -58,9 +58,9 @@ TAG_MAP = {
|
|||
"Number": "sing",
|
||||
"Person": "three",
|
||||
},
|
||||
"WDT": {POS: PRON},
|
||||
"WDT": {POS: DET},
|
||||
"WP": {POS: PRON},
|
||||
"WP$": {POS: PRON, "Poss": "yes"},
|
||||
"WP$": {POS: DET, "Poss": "yes"},
|
||||
"WRB": {POS: ADV},
|
||||
"ADD": {POS: X},
|
||||
"NFP": {POS: PUNCT},
|
||||
|
|
|
@ -48,14 +48,14 @@ be installed if needed via `pip install spacy[lookups]`. Some languages provide
|
|||
full lemmatization rules and exceptions, while other languages currently only
|
||||
rely on simple lookup tables.
|
||||
|
||||
<Infobox title="About spaCy's custom pronoun lemma" variant="warning">
|
||||
<Infobox title="About spaCy's custom pronoun lemma for English" variant="warning">
|
||||
|
||||
spaCy adds a **special case for pronouns**: all pronouns are lemmatized to the
|
||||
special token `-PRON-`. Unlike verbs and common nouns, there's no clear base
|
||||
form of a personal pronoun. Should the lemma of "me" be "I", or should we
|
||||
normalize person as well, giving "it" — or maybe "he"? spaCy's solution is to
|
||||
introduce a novel symbol, `-PRON-`, which is used as the lemma for all personal
|
||||
pronouns.
|
||||
spaCy adds a **special case for English pronouns**: all English pronouns are
|
||||
lemmatized to the special token `-PRON-`. Unlike verbs and common nouns,
|
||||
there's no clear base form of a personal pronoun. Should the lemma of "me" be
|
||||
"I", or should we normalize person as well, giving "it" — or maybe "he"?
|
||||
spaCy's solution is to introduce a novel symbol, `-PRON-`, which is used as the
|
||||
lemma for all English personal pronouns.
|
||||
|
||||
</Infobox>
|
||||
|
||||
|
@ -117,76 +117,72 @@ type. They're available as the [`Token.pos`](/api/token#attributes) and
|
|||
|
||||
The English part-of-speech tagger uses the
|
||||
[OntoNotes 5](https://catalog.ldc.upenn.edu/LDC2013T19) version of the Penn
|
||||
Treebank tag set. We also map the tags to the simpler Google Universal POS tag
|
||||
set.
|
||||
|
||||
| Tag | POS | Morphology | Description |
|
||||
| ----------------------------------- | ------- | ---------------------------------------------- | ----------------------------------------- |
|
||||
| `-LRB-` | `PUNCT` | `PunctType=brck PunctSide=ini` | left round bracket |
|
||||
| `-RRB-` | `PUNCT` | `PunctType=brck PunctSide=fin` | right round bracket |
|
||||
| `,` | `PUNCT` | `PunctType=comm` | punctuation mark, comma |
|
||||
| `:` | `PUNCT` | | punctuation mark, colon or ellipsis |
|
||||
| `.` | `PUNCT` | `PunctType=peri` | punctuation mark, sentence closer |
|
||||
| `''` | `PUNCT` | `PunctType=quot PunctSide=fin` | closing quotation mark |
|
||||
| `""` | `PUNCT` | `PunctType=quot PunctSide=fin` | closing quotation mark |
|
||||
| <InlineCode>``</InlineCode> | `PUNCT` | `PunctType=quot PunctSide=ini` | opening quotation mark |
|
||||
| `#` | `SYM` | `SymType=numbersign` | symbol, number sign |
|
||||
| `$` | `SYM` | `SymType=currency` | symbol, currency |
|
||||
| `ADD` | `X` | | email |
|
||||
| `AFX` | `ADJ` | `Hyph=yes` | affix |
|
||||
| `BES` | `VERB` | | auxiliary "be" |
|
||||
| `CC` | `CONJ` | `ConjType=coor` | conjunction, coordinating |
|
||||
| `CD` | `NUM` | `NumType=card` | cardinal number |
|
||||
| `DT` | `DET` | | determiner |
|
||||
| `EX` | `ADV` | `AdvType=ex` | existential there |
|
||||
| `FW` | `X` | `Foreign=yes` | foreign word |
|
||||
| `GW` | `X` | | additional word in multi-word expression |
|
||||
| `HVS` | `VERB` | | forms of "have" |
|
||||
| `HYPH` | `PUNCT` | `PunctType=dash` | punctuation mark, hyphen |
|
||||
| `IN` | `ADP` | | conjunction, subordinating or preposition |
|
||||
| `JJ` | `ADJ` | `Degree=pos` | adjective |
|
||||
| `JJR` | `ADJ` | `Degree=comp` | adjective, comparative |
|
||||
| `JJS` | `ADJ` | `Degree=sup` | adjective, superlative |
|
||||
| `LS` | `PUNCT` | `NumType=ord` | list item marker |
|
||||
| `MD` | `VERB` | `VerbType=mod` | verb, modal auxiliary |
|
||||
| `NFP` | `PUNCT` | | superfluous punctuation |
|
||||
| `NIL` | | | missing tag |
|
||||
| `NN` | `NOUN` | `Number=sing` | noun, singular or mass |
|
||||
| `NNP` | `PROPN` | `NounType=prop Number=sign` | noun, proper singular |
|
||||
| `NNPS` | `PROPN` | `NounType=prop Number=plur` | noun, proper plural |
|
||||
| `NNS` | `NOUN` | `Number=plur` | noun, plural |
|
||||
| `PDT` | `ADJ` | `AdjType=pdt PronType=prn` | predeterminer |
|
||||
| `POS` | `PART` | `Poss=yes` | possessive ending |
|
||||
| `PRP` | `PRON` | `PronType=prs` | pronoun, personal |
|
||||
| `PRP$` | `ADJ` | `PronType=prs Poss=yes` | pronoun, possessive |
|
||||
| `RB` | `ADV` | `Degree=pos` | adverb |
|
||||
| `RBR` | `ADV` | `Degree=comp` | adverb, comparative |
|
||||
| `RBS` | `ADV` | `Degree=sup` | adverb, superlative |
|
||||
| `RP` | `PART` | | adverb, particle |
|
||||
| `_SP` | `SPACE` | | space |
|
||||
| `SYM` | `SYM` | | symbol |
|
||||
| `TO` | `PART` | `PartType=inf VerbForm=inf` | infinitival "to" |
|
||||
| `UH` | `INTJ` | | interjection |
|
||||
| `VB` | `VERB` | `VerbForm=inf` | verb, base form |
|
||||
| `VBD` | `VERB` | `VerbForm=fin Tense=past` | verb, past tense |
|
||||
| `VBG` | `VERB` | `VerbForm=part Tense=pres Aspect=prog` | verb, gerund or present participle |
|
||||
| `VBN` | `VERB` | `VerbForm=part Tense=past Aspect=perf` | verb, past participle |
|
||||
| `VBP` | `VERB` | `VerbForm=fin Tense=pres` | verb, non-3rd person singular present |
|
||||
| `VBZ` | `VERB` | `VerbForm=fin Tense=pres Number=sing Person=3` | verb, 3rd person singular present |
|
||||
| `WDT` | `ADJ` | `PronType=int|rel` | wh-determiner |
|
||||
| `WP` | `NOUN` | `PronType=int|rel` | wh-pronoun, personal |
|
||||
| `WP$` | `ADJ` | `Poss=yes PronType=int|rel` | wh-pronoun, possessive |
|
||||
| `WRB` | `ADV` | `PronType=int|rel` | wh-adverb |
|
||||
| `XX` | `X` | | unknown |
|
||||
Treebank tag set. We also map the tags to the simpler Universal Dependencies v2
|
||||
POS tag set.
|
||||
|
||||
| Tag | POS | Morphology | Description |
|
||||
| ------------------------------------- | ------- | --------------------------------------- | ----------------------------------------- |
|
||||
| `$` | `SYM` | | symbol, currency |
|
||||
| <InlineCode>``</InlineCode> | `PUNCT` | `PunctType=quot PunctSide=ini` | opening quotation mark |
|
||||
| `''` | `PUNCT` | `PunctType=quot PunctSide=fin` | closing quotation mark |
|
||||
| `,` | `PUNCT` | `PunctType=comm` | punctuation mark, comma |
|
||||
| `-LRB-` | `PUNCT` | `PunctType=brck PunctSide=ini` | left round bracket |
|
||||
| `-RRB-` | `PUNCT` | `PunctType=brck PunctSide=fin` | right round bracket |
|
||||
| `.` | `PUNCT` | `PunctType=peri` | punctuation mark, sentence closer |
|
||||
| `:` | `PUNCT` | | punctuation mark, colon or ellipsis |
|
||||
| `ADD` | `X` | | email |
|
||||
| `AFX` | `ADJ` | `Hyph=yes` | affix |
|
||||
| `CC` | `CCONJ` | `ConjType=comp` | conjunction, coordinating |
|
||||
| `CD` | `NUM` | `NumType=card` | cardinal number |
|
||||
| `DT` | `DET` | | determiner |
|
||||
| `EX` | `PRON` | `AdvType=ex` | existential there |
|
||||
| `FW` | `X` | `Foreign=yes` | foreign word |
|
||||
| `GW` | `X` | | additional word in multi-word expression |
|
||||
| `HYPH` | `PUNCT` | `PunctType=dash` | punctuation mark, hyphen |
|
||||
| `IN` | `ADP` | | conjunction, subordinating or preposition |
|
||||
| `JJ` | `ADJ` | `Degree=pos` | adjective |
|
||||
| `JJR` | `ADJ` | `Degree=comp` | adjective, comparative |
|
||||
| `JJS` | `ADJ` | `Degree=sup` | adjective, superlative |
|
||||
| `LS` | `X` | `NumType=ord` | list item marker |
|
||||
| `MD` | `VERB` | `VerbType=mod` | verb, modal auxiliary |
|
||||
| `NFP` | `PUNCT` | | superfluous punctuation |
|
||||
| `NIL` | `X` | | missing tag |
|
||||
| `NN` | `NOUN` | `Number=sing` | noun, singular or mass |
|
||||
| `NNP` | `PROPN` | `NounType=prop Number=sing` | noun, proper singular |
|
||||
| `NNPS` | `PROPN` | `NounType=prop Number=plur` | noun, proper plural |
|
||||
| `NNS` | `NOUN` | `Number=plur` | noun, plural |
|
||||
| `PDT` | `DET` | | predeterminer |
|
||||
| `POS` | `PART` | `Poss=yes` | possessive ending |
|
||||
| `PRP` | `PRON` | `PronType=prs` | pronoun, personal |
|
||||
| `PRP$` | `DET` | `PronType=prs Poss=yes` | pronoun, possessive |
|
||||
| `RB` | `ADV` | `Degree=pos` | adverb |
|
||||
| `RBR` | `ADV` | `Degree=comp` | adverb, comparative |
|
||||
| `RBS` | `ADV` | `Degree=sup` | adverb, superlative |
|
||||
| `RP` | `ADP` | | adverb, particle |
|
||||
| `SP` | `SPACE` | | space |
|
||||
| `SYM` | `SYM` | | symbol |
|
||||
| `TO` | `PART` | `PartType=inf VerbForm=inf` | infinitival "to" |
|
||||
| `UH` | `INTJ` | | interjection |
|
||||
| `VB` | `VERB` | `VerbForm=inf` | verb, base form |
|
||||
| `VBD` | `VERB` | `VerbForm=fin Tense=past` | verb, past tense |
|
||||
| `VBG` | `VERB` | `VerbForm=part Tense=pres Aspect=prog` | verb, gerund or present participle |
|
||||
| `VBN` | `VERB` | `VerbForm=part Tense=past Aspect=perf` | verb, past participle |
|
||||
| `VBP` | `VERB` | `VerbForm=fin Tense=pres` | verb, non-3rd person singular present |
|
||||
| `VBZ` | `VERB` | `VerbForm=fin Tense=pres Number=sing Person=three` | verb, 3rd person singular present |
|
||||
| `WDT` | `DET` | | wh-determiner |
|
||||
| `WP` | `PRON` | | wh-pronoun, personal |
|
||||
| `WP$` | `DET` | `Poss=yes` | wh-pronoun, possessive |
|
||||
| `WRB` | `ADV` | | wh-adverb |
|
||||
| `XX` | `X` | | unknown |
|
||||
| `_SP` | `SPACE` | | space |
|
||||
</Accordion>
|
||||
|
||||
<Accordion title="German" id="pos-de">
|
||||
|
||||
The German part-of-speech tagger uses the
|
||||
[TIGER Treebank](http://www.ims.uni-stuttgart.de/forschung/ressourcen/korpora/TIGERCorpus/annotation/index.html)
|
||||
annotation scheme. We also map the tags to the simpler Google Universal POS tag
|
||||
set.
|
||||
annotation scheme. We also map the tags to the simpler Universal Dependencies
|
||||
v2 POS tag set.
|
||||
|
||||
| Tag | POS | Morphology | Description |
|
||||
| --------- | ------- | ---------------------------------------- | ------------------------------------------------- |
|
||||
|
@ -194,7 +190,7 @@ set.
|
|||
| `$,` | `PUNCT` | `PunctType=comm` | comma |
|
||||
| `$.` | `PUNCT` | `PunctType=peri` | sentence-final punctuation mark |
|
||||
| `ADJA` | `ADJ` | | adjective, attributive |
|
||||
| `ADJD` | `ADJ` | `Variant=short` | adjective, adverbial or predicative |
|
||||
| `ADJD` | `ADJ` | | adjective, adverbial or predicative |
|
||||
| `ADV` | `ADV` | | adverb |
|
||||
| `APPO` | `ADP` | `AdpType=post` | postposition |
|
||||
| `APPR` | `ADP` | `AdpType=prep` | preposition; circumposition left |
|
||||
|
@ -204,28 +200,28 @@ set.
|
|||
| `CARD` | `NUM` | `NumType=card` | cardinal number |
|
||||
| `FM` | `X` | `Foreign=yes` | foreign language material |
|
||||
| `ITJ` | `INTJ` | | interjection |
|
||||
| `KOKOM` | `CONJ` | `ConjType=comp` | comparative conjunction |
|
||||
| `KON` | `CONJ` | | coordinate conjunction |
|
||||
| `KOKOM` | `CCONJ` | `ConjType=comp` | comparative conjunction |
|
||||
| `KON` | `CCONJ` | | coordinate conjunction |
|
||||
| `KOUI` | `SCONJ` | | subordinate conjunction with "zu" and infinitive |
|
||||
| `KOUS` | `SCONJ` | | subordinate conjunction with sentence |
|
||||
| `NE` | `PROPN` | | proper noun |
|
||||
| `NNE` | `PROPN` | | proper noun |
|
||||
| `NN` | `NOUN` | | noun, singular or mass |
|
||||
| `PROAV` | `ADV` | `PronType=dem` | pronominal adverb |
|
||||
| `NNE` | `PROPN` | | proper noun |
|
||||
| `PDAT` | `DET` | `PronType=dem` | attributive demonstrative pronoun |
|
||||
| `PDS` | `PRON` | `PronType=dem` | substituting demonstrative pronoun |
|
||||
| `PIAT` | `DET` | `PronType=ind\|neg\|tot` | attributive indefinite pronoun without determiner |
|
||||
| `PIS` | `PRON` | `PronType=ind\|neg\|tot` | substituting indefinite pronoun |
|
||||
| `PIAT` | `DET` | `PronType=ind\|neg\|tot` | attributive indefinite pronoun without determiner |
|
||||
| `PIS` | `PRON` | `PronType=ind\|neg\|tot` | substituting indefinite pronoun |
|
||||
| `PPER` | `PRON` | `PronType=prs` | non-reflexive personal pronoun |
|
||||
| `PPOSAT` | `DET` | `Poss=yes PronType=prs` | attributive possessive pronoun |
|
||||
| `PPOSS` | `PRON` | `PronType=rel` | substituting possessive pronoun |
|
||||
| `PPOSS` | `PRON` | `Poss=yes PronType=prs` | substituting possessive pronoun |
|
||||
| `PRELAT` | `DET` | `PronType=rel` | attributive relative pronoun |
|
||||
| `PRELS` | `PRON` | `PronType=rel` | substituting relative pronoun |
|
||||
| `PRF` | `PRON` | `PronType=prs Reflex=yes` | reflexive personal pronoun |
|
||||
| `PROAV` | `ADV` | `PronType=dem` | pronominal adverb |
|
||||
| `PTKA` | `PART` | | particle with adjective or adverb |
|
||||
| `PTKANT` | `PART` | `PartType=res` | answer particle |
|
||||
| `PTKNEG` | `PART` | `Negative=yes` | negative particle |
|
||||
| `PTKVZ` | `PART` | `PartType=vbp` | separable verbal particle |
|
||||
| `PTKNEG` | `PART` | `Polarity=neg` | negative particle |
|
||||
| `PTKVZ` | `ADP` | `PartType=vbp` | separable verbal particle |
|
||||
| `PTKZU` | `PART` | `PartType=inf` | "zu" before infinitive |
|
||||
| `PWAT` | `DET` | `PronType=int` | attributive interrogative pronoun |
|
||||
| `PWAV` | `ADV` | `PronType=int` | adverbial interrogative or relative pronoun |
|
||||
|
@ -234,9 +230,9 @@ set.
|
|||
| `VAFIN` | `AUX` | `Mood=ind VerbForm=fin` | finite verb, auxiliary |
|
||||
| `VAIMP` | `AUX` | `Mood=imp VerbForm=fin` | imperative, auxiliary |
|
||||
| `VAINF` | `AUX` | `VerbForm=inf` | infinitive, auxiliary |
|
||||
| `VAPP` | `AUX` | `Aspect=perf VerbForm=fin` | perfect participle, auxiliary |
|
||||
| `VAPP` | `AUX` | `Aspect=perf VerbForm=part` | perfect participle, auxiliary |
|
||||
| `VMFIN` | `VERB` | `Mood=ind VerbForm=fin VerbType=mod` | finite verb, modal |
|
||||
| `VMINF` | `VERB` | `VerbForm=fin VerbType=mod` | infinitive, modal |
|
||||
| `VMINF` | `VERB` | `VerbForm=inf VerbType=mod` | infinitive, modal |
|
||||
| `VMPP` | `VERB` | `Aspect=perf VerbForm=part VerbType=mod` | perfect participle, modal |
|
||||
| `VVFIN` | `VERB` | `Mood=ind VerbForm=fin` | finite verb, full |
|
||||
| `VVIMP` | `VERB` | `Mood=imp VerbForm=fin` | imperative, full |
|
||||
|
@ -244,8 +240,7 @@ set.
|
|||
| `VVIZU` | `VERB` | `VerbForm=inf` | infinitive with "zu", full |
|
||||
| `VVPP` | `VERB` | `Aspect=perf VerbForm=part` | perfect participle, full |
|
||||
| `XY` | `X` | | non-word containing non-letter |
|
||||
| `SP` | `SPACE` | | space |
|
||||
|
||||
| `_SP` | `SPACE` | | space |
|
||||
</Accordion>
|
||||
|
||||
---
|
||||
|
|
Loading…
Reference in New Issue
Block a user