From cc5aeaed29c067f60d11e07496704406a1577a35 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 6 May 2021 10:43:03 +0200 Subject: [PATCH] Add Chinese PTB tags to glossary (#7993) --- spacy/glossary.py | 38 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 36 insertions(+), 2 deletions(-) diff --git a/spacy/glossary.py b/spacy/glossary.py index c4a6a5c45..0dc075ca7 100644 --- a/spacy/glossary.py +++ b/spacy/glossary.py @@ -58,7 +58,7 @@ GLOSSARY = { "FW": "foreign word", "HYPH": "punctuation mark, hyphen", "IN": "conjunction, subordinating or preposition", - "JJ": "adjective", + "JJ": "adjective (English), other noun-modifier (Chinese)", "JJR": "adjective, comparative", "JJS": "adjective, superlative", "LS": "list item marker", @@ -88,7 +88,7 @@ GLOSSARY = { "WP": "wh-pronoun, personal", "WP$": "wh-pronoun, possessive", "WRB": "wh-adverb", - "SP": "space", + "SP": "space (English), sentence-final particle (Chinese)", "ADD": "email", "NFP": "superfluous punctuation", "GW": "additional word in multi-word expression", @@ -152,6 +152,40 @@ GLOSSARY = { "VVIZU": 'infinitive with "zu", full', "VVPP": "perfect participle, full", "XY": "non-word containing non-letter", + # POS Tags (Chinese) + # OntoNotes / Chinese Penn Treebank + # https://repository.upenn.edu/cgi/viewcontent.cgi?article=1039&context=ircs_reports + "AD": "adverb", + "AS": "aspect marker", + "BA": "把 in ba-construction", + # "CD": "cardinal number", + "CS": "subordinating conjunction", + "DEC": "的 in a relative clause", + "DEG": "associative 的", + "DER": "得 in V-de const. and V-de-R", + "DEV": "地 before VP", + "ETC": "for words 等, 等等", + # "FW": "foreign words" + "IJ": "interjection", + # "JJ": "other noun-modifier", + "LB": "被 in long bei-const", + "LC": "localizer", + "M": "measure word", + "MSP": "other particle", + # "NN": "common noun", + "NR": "proper noun", + "NT": "temporal noun", + "OD": "ordinal number", + "ON": "onomatopoeia", + "P": "preposition excluding 把 and 被", + "PN": "pronoun", + "PU": "punctuation", + "SB": "被 in short bei-const", + # "SP": "sentence-final particle", + "VA": "predicative adjective", + "VC": "是 (copula)", + "VE": "有 as the main verb", + "VV": "other verb", # Noun chunks "NP": "noun phrase", "PP": "prepositional phrase",