Update tag maps and docs for English and German (#4501)

* Update English tag_map

  Update English tag_map based on this conversion table:
  https://universaldependencies.org/tagset-conversion/en-penn-uposf.html

* Update German tag_map

  Update German tag_map based on this conversion table:
  https://universaldependencies.org/tagset-conversion/de-stts-uposf.html

* Add missing Tiger dependencies to glossary

* Add quotes to definition of TO

* Update POS/TAG tables in docs

  Update POS/TAG tables for English and German docs using current
  information generated from the tag_maps and GLOSSARY.

* Update warning that -PRON- is specific to English

* Revert docs to default JSON output with convert

* Revert "Revert docs to default JSON output with convert"

  This reverts commit 6b78c048f1.
2025-08-24 14:04:56 +03:00 · 2019-10-24 12:56:05 +02:00 · 2019-10-24 12:56:05 +02:00 · 1b0bbe4b76
commit 1b0bbe4b76
parent 10d88b09bb
4 changed files with 96 additions and 95 deletions
--- a/spacy/glossary.py
+++ b/spacy/glossary.py
@ -80,7 +80,7 @@ GLOSSARY = {
    "RBR": "adverb, comparative",
    "RBS": "adverb, superlative",
    "RP": "adverb, particle",
-    "TO": "infinitival to",
+    "TO": 'infinitival "to"',
    "UH": "interjection",
    "VB": "verb, base form",
    "VBD": "verb, past tense",
@ -279,6 +279,12 @@ GLOSSARY = {
    "re": "repeated element",
    "rs": "reported speech",
    "sb": "subject",
+    "sb": "subject",
+    "sbp": "passivized subject (PP)",
+    "sp": "subject or predicate",
+    "svp": "separable verb prefix",
+    "uc": "unit component",
+    "vo": "vocative",
    # Named Entity Recognition
    # OntoNotes 5
    # https://catalog.ldc.upenn.edu/docs/LDC2013T19/OntoNotes-Release-5.0.pdf
--- a/spacy/lang/de/tag_map.py
+++ b/spacy/lang/de/tag_map.py
@ -1,8 +1,8 @@
 # coding: utf8
 from __future__ import unicode_literals

-from ...symbols import POS, PUNCT, ADJ, CONJ, SCONJ, NUM, DET, ADV, ADP, X, VERB
-from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON, AUX
+from ...symbols import POS, PUNCT, ADJ, CCONJ, SCONJ, NUM, DET, ADV, ADP, X
+from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON, AUX, VERB


 TAG_MAP = {
@ -20,8 +20,8 @@ TAG_MAP = {
    "CARD": {POS: NUM, "NumType": "card"},
    "FM": {POS: X, "Foreign": "yes"},
    "ITJ": {POS: INTJ},
-    "KOKOM": {POS: CONJ, "ConjType": "comp"},
-    "KON": {POS: CONJ},
+    "KOKOM": {POS: CCONJ, "ConjType": "comp"},
+    "KON": {POS: CCONJ},
    "KOUI": {POS: SCONJ},
    "KOUS": {POS: SCONJ},
    "NE": {POS: PROPN},
@ -43,7 +43,7 @@ TAG_MAP = {
    "PTKA": {POS: PART},
    "PTKANT": {POS: PART, "PartType": "res"},
    "PTKNEG": {POS: PART, "Polarity": "neg"},
-    "PTKVZ": {POS: PART, "PartType": "vbp"},
+    "PTKVZ": {POS: ADP, "PartType": "vbp"},
    "PTKZU": {POS: PART, "PartType": "inf"},
    "PWAT": {POS: DET, "PronType": "int"},
    "PWAV": {POS: ADV, "PronType": "int"},
--- a/spacy/lang/en/tag_map.py
+++ b/spacy/lang/en/tag_map.py
@ -28,8 +28,8 @@ TAG_MAP = {
    "JJR": {POS: ADJ, "Degree": "comp"},
    "JJS": {POS: ADJ, "Degree": "sup"},
    "LS": {POS: X, "NumType": "ord"},
-    "MD": {POS: AUX, "VerbType": "mod"},
-    "NIL": {POS: ""},
+    "MD": {POS: VERB, "VerbType": "mod"},
+    "NIL": {POS: X},
    "NN": {POS: NOUN, "Number": "sing"},
    "NNP": {POS: PROPN, "NounType": "prop", "Number": "sing"},
    "NNPS": {POS: PROPN, "NounType": "prop", "Number": "plur"},
@ -37,7 +37,7 @@ TAG_MAP = {
    "PDT": {POS: DET},
    "POS": {POS: PART, "Poss": "yes"},
    "PRP": {POS: PRON, "PronType": "prs"},
-    "PRP$": {POS: PRON, "PronType": "prs", "Poss": "yes"},
+    "PRP$": {POS: DET, "PronType": "prs", "Poss": "yes"},
    "RB": {POS: ADV, "Degree": "pos"},
    "RBR": {POS: ADV, "Degree": "comp"},
    "RBS": {POS: ADV, "Degree": "sup"},
@ -58,9 +58,9 @@ TAG_MAP = {
        "Number": "sing",
        "Person": "three",
    },
-    "WDT": {POS: PRON},
+    "WDT": {POS: DET},
    "WP": {POS: PRON},
-    "WP$": {POS: PRON, "Poss": "yes"},
+    "WP$": {POS: DET, "Poss": "yes"},
    "WRB": {POS: ADV},
    "ADD": {POS: X},
    "NFP": {POS: PUNCT},
--- a/website/docs/api/annotation.md
+++ b/website/docs/api/annotation.md
@ -48,14 +48,14 @@ be installed if needed via `pip install spacy[lookups]`. Some languages provide
 full lemmatization rules and exceptions, while other languages currently only
 rely on simple lookup tables.

-<Infobox title="About spaCy's custom pronoun lemma" variant="warning">
+<Infobox title="About spaCy's custom pronoun lemma for English" variant="warning">

-spaCy adds a **special case for pronouns**: all pronouns are lemmatized to the
-special token `-PRON-`. Unlike verbs and common nouns, there's no clear base
-form of a personal pronoun. Should the lemma of "me" be "I", or should we
-normalize person as well, giving "it" — or maybe "he"? spaCy's solution is to
-introduce a novel symbol, `-PRON-`, which is used as the lemma for all personal
-pronouns.
+spaCy adds a **special case for English pronouns**: all English pronouns are
+lemmatized to the special token `-PRON-`. Unlike verbs and common nouns,
+there's no clear base form of a personal pronoun. Should the lemma of "me" be
+"I", or should we normalize person as well, giving "it" — or maybe "he"?
+spaCy's solution is to introduce a novel symbol, `-PRON-`, which is used as the
+lemma for all personal pronouns.

 </Infobox>

@ -117,76 +117,72 @@ type. They're available as the [`Token.pos`](/api/token#attributes) and

 The English part-of-speech tagger uses the
 [OntoNotes 5](https://catalog.ldc.upenn.edu/LDC2013T19) version of the Penn
-Treebank tag set. We also map the tags to the simpler Google Universal POS tag
-set.
-
-| Tag                                 |  POS    | Morphology                                     | Description                               |
-| ----------------------------------- | ------- | ---------------------------------------------- | ----------------------------------------- |
-| `-LRB-`                             | `PUNCT` | `PunctType=brck PunctSide=ini`                 | left round bracket                        |
-| `-RRB-`                             | `PUNCT` | `PunctType=brck PunctSide=fin`                 | right round bracket                       |
-| `,`                                 | `PUNCT` | `PunctType=comm`                               | punctuation mark, comma                   |
-| `:`                                 | `PUNCT` |                                                | punctuation mark, colon or ellipsis       |
-| `.`                                 | `PUNCT` | `PunctType=peri`                               | punctuation mark, sentence closer         |
-| `''`                                | `PUNCT` | `PunctType=quot PunctSide=fin`                 | closing quotation mark                    |
-| `""`                                | `PUNCT` | `PunctType=quot PunctSide=fin`                 | closing quotation mark                    |
-| <InlineCode>&#96;&#96;</InlineCode> | `PUNCT` | `PunctType=quot PunctSide=ini`                 | opening quotation mark                    |
-| `#`                                 | `SYM`   | `SymType=numbersign`                           | symbol, number sign                       |
-| `$`                                 | `SYM`   | `SymType=currency`                             | symbol, currency                          |
-| `ADD`                               | `X`     |                                                | email                                     |
-| `AFX`                               | `ADJ`   | `Hyph=yes`                                     | affix                                     |
-| `BES`                               | `VERB`  |                                                | auxiliary "be"                            |
-| `CC`                                | `CONJ`  | `ConjType=coor`                                | conjunction, coordinating                 |
-| `CD`                                | `NUM`   | `NumType=card`                                 | cardinal number                           |
-| `DT`                                | `DET`   |                                                | determiner                                |
-| `EX`                                | `ADV`   | `AdvType=ex`                                   | existential there                         |
-| `FW`                                | `X`     | `Foreign=yes`                                  | foreign word                              |
-| `GW`                                | `X`     |                                                | additional word in multi-word expression  |
-| `HVS`                               | `VERB`  |                                                | forms of "have"                           |
-| `HYPH`                              | `PUNCT` | `PunctType=dash`                               | punctuation mark, hyphen                  |
-| `IN`                                | `ADP`   |                                                | conjunction, subordinating or preposition |
-| `JJ`                                | `ADJ`   | `Degree=pos`                                   | adjective                                 |
-| `JJR`                               | `ADJ`   | `Degree=comp`                                  | adjective, comparative                    |
-| `JJS`                               | `ADJ`   | `Degree=sup`                                   | adjective, superlative                    |
-| `LS`                                | `PUNCT` | `NumType=ord`                                  | list item marker                          |
-| `MD`                                | `VERB`  | `VerbType=mod`                                 | verb, modal auxiliary                     |
-| `NFP`                               | `PUNCT` |                                                | superfluous punctuation                   |
-| `NIL`                               |         |                                                | missing tag                               |
-| `NN`                                | `NOUN`  | `Number=sing`                                  | noun, singular or mass                    |
-| `NNP`                               | `PROPN` | `NounType=prop Number=sign`                    | noun, proper singular                     |
-| `NNPS`                              | `PROPN` | `NounType=prop Number=plur`                    | noun, proper plural                       |
-| `NNS`                               | `NOUN`  | `Number=plur`                                  | noun, plural                              |
-| `PDT`                               | `ADJ`   | `AdjType=pdt PronType=prn`                     | predeterminer                             |
-| `POS`                               | `PART`  | `Poss=yes`                                     | possessive ending                         |
-| `PRP`                               | `PRON`  | `PronType=prs`                                 | pronoun, personal                         |
-| `PRP$`                              | `ADJ`   | `PronType=prs Poss=yes`                        | pronoun, possessive                       |
-| `RB`                                | `ADV`   | `Degree=pos`                                   | adverb                                    |
-| `RBR`                               | `ADV`   | `Degree=comp`                                  | adverb, comparative                       |
-| `RBS`                               | `ADV`   | `Degree=sup`                                   | adverb, superlative                       |
-| `RP`                                | `PART`  |                                                | adverb, particle                          |
-| `_SP`                               | `SPACE` |                                                | space                                     |
-| `SYM`                               | `SYM`   |                                                | symbol                                    |
-| `TO`                                | `PART`  | `PartType=inf VerbForm=inf`                    | infinitival "to"                          |
-| `UH`                                | `INTJ`  |                                                | interjection                              |
-| `VB`                                | `VERB`  | `VerbForm=inf`                                 | verb, base form                           |
-| `VBD`                               | `VERB`  | `VerbForm=fin Tense=past`                      | verb, past tense                          |
-| `VBG`                               | `VERB`  | `VerbForm=part Tense=pres Aspect=prog`         | verb, gerund or present participle        |
-| `VBN`                               | `VERB`  | `VerbForm=part Tense=past Aspect=perf`         | verb, past participle                     |
-| `VBP`                               | `VERB`  | `VerbForm=fin Tense=pres`                      | verb, non-3rd person singular present     |
-| `VBZ`                               | `VERB`  | `VerbForm=fin Tense=pres Number=sing Person=3` | verb, 3rd person singular present         |
-| `WDT`                               | `ADJ`   | `PronType=int|rel`                             | wh-determiner                             |
-| `WP`                                | `NOUN`  | `PronType=int|rel`                             | wh-pronoun, personal                      |
-| `WP$`                               | `ADJ`   | `Poss=yes PronType=int|rel`                    | wh-pronoun, possessive                    |
-| `WRB`                               | `ADV`   | `PronType=int|rel`                             | wh-adverb                                 |
-| `XX`                                | `X`     |                                                | unknown                                   |
+Treebank tag set. We also map the tags to the simpler Universal Dependencies v2
+POS tag set.

+| Tag                                   |  POS    | Morphology                              | Description                               |
+| ------------------------------------- | ------- | --------------------------------------- | ----------------------------------------- |
+| `$`                                   | `SYM`   |                                          | symbol, currency                          |
+| <InlineCode>&#96;&#96;</InlineCode>   | `PUNCT` | `PunctType=quot PunctSide=ini`           | opening quotation mark                    |
+| `''`                                  | `PUNCT` | `PunctType=quot PunctSide=fin`           | closing quotation mark                    |
+| `,`                                   | `PUNCT` | `PunctType=comm`                         | punctuation mark, comma                   |
+| `-LRB-`                               | `PUNCT` | `PunctType=brck PunctSide=ini`           | left round bracket                        |
+| `-RRB-`                               | `PUNCT` | `PunctType=brck PunctSide=fin`           | right round bracket                       |
+| `.`                                   | `PUNCT` | `PunctType=peri`                         | punctuation mark, sentence closer         |
+| `:`                                   | `PUNCT` |                                          | punctuation mark, colon or ellipsis       |
+| `ADD`                                 | `X`     |                                          | email                                     |
+| `AFX`                                 | `ADJ`   | `Hyph=yes`                               | affix                                     |
+| `CC`                                  | `CCONJ` | `ConjType=comp`                          | conjunction, coordinating                 |
+| `CD`                                  | `NUM`   | `NumType=card`                           | cardinal number                           |
+| `DT`                                  | `DET`   |                                          | determiner                                |
+| `EX`                                  | `PRON`  | `AdvType=ex`                             | existential there                         |
+| `FW`                                  | `X`     | `Foreign=yes`                            | foreign word                              |
+| `GW`                                  | `X`     |                                          | additional word in multi-word expression  |
+| `HYPH`                                | `PUNCT` | `PunctType=dash`                         | punctuation mark, hyphen                  |
+| `IN`                                  | `ADP`   |                                          | conjunction, subordinating or preposition |
+| `JJ`                                  | `ADJ`   | `Degree=pos`                             | adjective                                 |
+| `JJR`                                 | `ADJ`   | `Degree=comp`                            | adjective, comparative                    |
+| `JJS`                                 | `ADJ`   | `Degree=sup`                             | adjective, superlative                    |
+| `LS`                                  | `X`     | `NumType=ord`                            | list item marker                          |
+| `MD`                                  | `VERB`  | `VerbType=mod`                           | verb, modal auxiliary                     |
+| `NFP`                                 | `PUNCT` |                                          | superfluous punctuation                   |
+| `NIL`                                 | `X`     |                                          | missing tag                               |
+| `NN`                                  | `NOUN`  | `Number=sing`                            | noun, singular or mass                    |
+| `NNP`                                 | `PROPN` | `NounType=prop Number=sing`              | noun, proper singular                     |
+| `NNPS`                                | `PROPN` | `NounType=prop Number=plur`              | noun, proper plural                       |
+| `NNS`                                 | `NOUN`  | `Number=plur`                            | noun, plural                              |
+| `PDT`                                 | `DET`   |                                          | predeterminer                             |
+| `POS`                                 | `PART`  | `Poss=yes`                               | possessive ending                         |
+| `PRP`                                 | `PRON`  | `PronType=prs`                           | pronoun, personal                         |
+| `PRP$`                                | `DET`   | `PronType=prs Poss=yes`                  | pronoun, possessive                       |
+| `RB`                                  | `ADV`   | `Degree=pos`                             | adverb                                    |
+| `RBR`                                 | `ADV`   | `Degree=comp`                            | adverb, comparative                       |
+| `RBS`                                 | `ADV`   | `Degree=sup`                             | adverb, superlative                       |
+| `RP`                                  | `ADP`   |                                          | adverb, particle                          |
+| `SP`                                  | `SPACE` |                                          | space                                     |
+| `SYM`                                 | `SYM`   |                                          | symbol                                    |
+| `TO`                                  | `PART`  | `PartType=inf VerbForm=inf`              | infinitival "to"                          |
+| `UH`                                  | `INTJ`  |                                          | interjection                              |
+| `VB`                                  | `VERB`  | `VerbForm=inf`                           | verb, base form                           |
+| `VBD`                                 | `VERB`  | `VerbForm=fin Tense=past`                | verb, past tense                          |
+| `VBG`                                 | `VERB`  | `VerbForm=part Tense=pres Aspect=prog`   | verb, gerund or present participle        |
+| `VBN`                                 | `VERB`  | `VerbForm=part Tense=past Aspect=perf`   | verb, past participle                     |
+| `VBP`                                 | `VERB`  | `VerbForm=fin Tense=pres`                | verb, non-3rd person singular present     |
+| `VBZ`                                 | `VERB`  | `VerbForm=fin Tense=pres Number=sing Person=three` | verb, 3rd person singular present         |
+| `WDT`                                 | `DET`   |                                          | wh-determiner                             |
+| `WP`                                  | `PRON`  |                                          | wh-pronoun, personal                      |
+| `WP$`                                 | `DET`   | `Poss=yes`                               | wh-pronoun, possessive                    |
+| `WRB`                                 | `ADV`   |                                          | wh-adverb                                 |
+| `XX`                                  | `X`     |                                          | unknown                                   |
+| `_SP`                                 | `SPACE` |                                          | space                                     |
+
 </Accordion>

 <Accordion title="German" id="pos-de">

 The German part-of-speech tagger uses the
 [TIGER Treebank](http://www.ims.uni-stuttgart.de/forschung/ressourcen/korpora/TIGERCorpus/annotation/index.html)
-annotation scheme. We also map the tags to the simpler Google Universal POS tag
-set.
+annotation scheme. We also map the tags to the simpler Universal Dependencies
+v2 POS tag set.

 | Tag       |  POS    | Morphology                               | Description                                       |
 | --------- | ------- | ---------------------------------------- | ------------------------------------------------- |
@ -194,7 +190,7 @@ set.
 | `$,`      | `PUNCT` | `PunctType=comm`                         | comma                                             |
 | `$.`      | `PUNCT` | `PunctType=peri`                         | sentence-final punctuation mark                   |
 | `ADJA`    | `ADJ`   |                                          | adjective, attributive                            |
-| `ADJD`    | `ADJ`   | `Variant=short`                          | adjective, adverbial or predicative               |
+| `ADJD`    | `ADJ`   |                                          | adjective, adverbial or predicative               |
 | `ADV`     | `ADV`   |                                          | adverb                                            |
 | `APPO`    | `ADP`   | `AdpType=post`                           | postposition                                      |
 | `APPR`    | `ADP`   | `AdpType=prep`                           | preposition; circumposition left                  |
@ -204,28 +200,28 @@ set.
 | `CARD`    | `NUM`   | `NumType=card`                           | cardinal number                                   |
 | `FM`      | `X`     | `Foreign=yes`                            | foreign language material                         |
 | `ITJ`     | `INTJ`  |                                          | interjection                                      |
-| `KOKOM`   | `CONJ`  | `ConjType=comp`                          | comparative conjunction                           |
-| `KON`     | `CONJ`  |                                          | coordinate conjunction                            |
+| `KOKOM`   | `CCONJ` | `ConjType=comp`                          | comparative conjunction                           |
+| `KON`     | `CCONJ` |                                          | coordinate conjunction                            |
 | `KOUI`    | `SCONJ` |                                          | subordinate conjunction with "zu" and infinitive  |
 | `KOUS`    | `SCONJ` |                                          | subordinate conjunction with sentence             |
 | `NE`      | `PROPN` |                                          | proper noun                                       |
-| `NNE`     | `PROPN` |                                          | proper noun                                       |
 | `NN`      | `NOUN`  |                                          | noun, singular or mass                            |
-| `PROAV`   | `ADV`   | `PronType=dem`                           | pronominal adverb                                 |
+| `NNE`     | `PROPN` |                                          | proper noun                                       |
 | `PDAT`    | `DET`   | `PronType=dem`                           | attributive demonstrative pronoun                 |
 | `PDS`     | `PRON`  | `PronType=dem`                           | substituting demonstrative pronoun                |
-| `PIAT`    | `DET`   | `PronType=ind\|neg\|tot`                 | attributive indefinite pronoun without determiner |
-| `PIS`     | `PRON`  | `PronType=ind\|neg\|tot`                 | substituting indefinite pronoun                   |
+| `PIAT`    | `DET`   | `PronType=ind\|neg\|tot`                 | attributive indefinite pronoun without determiner |
+| `PIS`     | `PRON`  | `PronType=ind\|neg\|tot`                 | substituting indefinite pronoun                   |
 | `PPER`    | `PRON`  | `PronType=prs`                           | non-reflexive personal pronoun                    |
 | `PPOSAT`  | `DET`   | `Poss=yes PronType=prs`                  | attributive possessive pronoun                    |
-| `PPOSS`   | `PRON`  | `PronType=rel`                           | substituting possessive pronoun                   |
+| `PPOSS`   | `PRON`  | `Poss=yes PronType=prs`                  | substituting possessive pronoun                   |
 | `PRELAT`  | `DET`   | `PronType=rel`                           | attributive relative pronoun                      |
 | `PRELS`   | `PRON`  | `PronType=rel`                           | substituting relative pronoun                     |
 | `PRF`     | `PRON`  | `PronType=prs Reflex=yes`                | reflexive personal pronoun                        |
+| `PROAV`   | `ADV`   | `PronType=dem`                           | pronominal adverb                                 |
 | `PTKA`    | `PART`  |                                          | particle with adjective or adverb                 |
 | `PTKANT`  | `PART`  | `PartType=res`                           | answer particle                                   |
-| `PTKNEG`  | `PART`  | `Negative=yes`                           | negative particle                                 |
-| `PTKVZ`   | `PART`  | `PartType=vbp`                           | separable verbal particle                         |
+| `PTKNEG`  | `PART`  | `Polarity=neg`                           | negative particle                                 |
+| `PTKVZ`   | `ADP`   | `PartType=vbp`                           | separable verbal particle                         |
 | `PTKZU`   | `PART`  | `PartType=inf`                           | "zu" before infinitive                            |
 | `PWAT`    | `DET`   | `PronType=int`                           | attributive interrogative pronoun                 |
 | `PWAV`    | `ADV`   | `PronType=int`                           | adverbial interrogative or relative pronoun       |
@ -234,9 +230,9 @@ set.
 | `VAFIN`   | `AUX`   | `Mood=ind VerbForm=fin`                  | finite verb, auxiliary                            |
 | `VAIMP`   | `AUX`   | `Mood=imp VerbForm=fin`                  | imperative, auxiliary                             |
 | `VAINF`   | `AUX`   | `VerbForm=inf`                           | infinitive, auxiliary                             |
-| `VAPP`    | `AUX`   | `Aspect=perf VerbForm=fin`               | perfect participle, auxiliary                     |
+| `VAPP`    | `AUX`   | `Aspect=perf VerbForm=part`              | perfect participle, auxiliary                     |
 | `VMFIN`   | `VERB`  | `Mood=ind VerbForm=fin VerbType=mod`     | finite verb, modal                                |
-| `VMINF`   | `VERB`  | `VerbForm=fin VerbType=mod`              | infinitive, modal                                 |
+| `VMINF`   | `VERB`  | `VerbForm=inf VerbType=mod`              | infinitive, modal                                 |
 | `VMPP`    | `VERB`  | `Aspect=perf VerbForm=part VerbType=mod` | perfect participle, modal                         |
 | `VVFIN`   | `VERB`  | `Mood=ind VerbForm=fin`                  | finite verb, full                                 |
 | `VVIMP`   | `VERB`  | `Mood=imp VerbForm=fin`                  | imperative, full                                  |
@ -244,8 +240,7 @@ set.
 | `VVIZU`   | `VERB`  | `VerbForm=inf`                           | infinitive with "zu", full                        |
 | `VVPP`    | `VERB`  | `Aspect=perf VerbForm=part`              | perfect participle, full                          |
 | `XY`      | `X`     |                                          | non-word containing non-letter                    |
-| `SP`      | `SPACE` |                                          | space                                             |
-
+| `_SP`     | `SPACE` |                                          | space                                             |
+
 </Accordion>

 ---