From 49895fbef69598d18fd00197661ec3ad939de849 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 20 Oct 2017 14:01:12 +0200 Subject: [PATCH] Rename 'SP' special tag to '_SP' Renaming the tag with an underscore lets us add it to the tag map without worrying that we'll change the sequence of tags, which throws off the tag-to-ID mapping. For instance, if we inserted a 'SP' tag, the "VERB" tag is pushed to a different class ID, and the model is all messed up. --- spacy/lang/de/tag_map.py | 2 +- spacy/lang/en/tag_map.py | 4 +-- spacy/lang/es/tag_map.py | 2 +- spacy/lang/th/tag_map.py | 77 ++++++++++++++++++++-------------------- spacy/morphology.pyx | 14 ++++++-- 5 files changed, 54 insertions(+), 45 deletions(-) diff --git a/spacy/lang/de/tag_map.py b/spacy/lang/de/tag_map.py index d16bd17e0..730c15cfc 100644 --- a/spacy/lang/de/tag_map.py +++ b/spacy/lang/de/tag_map.py @@ -62,5 +62,5 @@ TAG_MAP = { "VVIZU": {POS: VERB, "VerbForm": "inf"}, "VVPP": {POS: VERB, "Aspect": "perf", "VerbForm": "part"}, "XY": {POS: X}, - "SP": {POS: SPACE} + "_SP": {POS: SPACE} } diff --git a/spacy/lang/en/tag_map.py b/spacy/lang/en/tag_map.py index a674c17e3..76eabf307 100644 --- a/spacy/lang/en/tag_map.py +++ b/spacy/lang/en/tag_map.py @@ -55,11 +55,11 @@ TAG_MAP = { "WP": {POS: NOUN, "PronType": "int|rel"}, "WP$": {POS: ADJ, "Poss": "yes", "PronType": "int|rel"}, "WRB": {POS: ADV, "PronType": "int|rel"}, - "SP": {POS: SPACE}, "ADD": {POS: X}, "NFP": {POS: PUNCT}, "GW": {POS: X}, "XX": {POS: X}, "BES": {POS: VERB}, - "HVS": {POS: VERB} + "HVS": {POS: VERB}, + "_SP": {POS: SPACE}, } diff --git a/spacy/lang/es/tag_map.py b/spacy/lang/es/tag_map.py index 86dd48620..2095d23b1 100644 --- a/spacy/lang/es/tag_map.py +++ b/spacy/lang/es/tag_map.py @@ -303,5 +303,5 @@ TAG_MAP = { "VERB__VerbForm=Ger": {"morph": "VerbForm=Ger", "pos": "VERB"}, "VERB__VerbForm=Inf": {"morph": "VerbForm=Inf", "pos": "VERB"}, "X___": {"morph": "_", "pos": "X"}, - "SP": {"morph": "_", "pos": "SPACE"}, + "_SP": 
{"morph": "_", "pos": "SPACE"}, } diff --git a/spacy/lang/th/tag_map.py b/spacy/lang/th/tag_map.py index 40e5ac44c..570871820 100644 --- a/spacy/lang/th/tag_map.py +++ b/spacy/lang/th/tag_map.py @@ -19,63 +19,64 @@ TAG_MAP = { "NPRP": {POS: PRON}, # ADJ "ADJ": {POS: ADJ}, - "NONM": {POS: ADJ}, - "VATT": {POS: ADJ}, - "DONM": {POS: ADJ}, + "NONM": {POS: ADJ}, + "VATT": {POS: ADJ}, + "DONM": {POS: ADJ}, # ADV "ADV": {POS: ADV}, - "ADVN": {POS: ADV}, - "ADVI": {POS: ADV}, - "ADVP": {POS: ADV}, - "ADVS": {POS: ADV}, + "ADVN": {POS: ADV}, + "ADVI": {POS: ADV}, + "ADVP": {POS: ADV}, + "ADVS": {POS: ADV}, # INT "INT": {POS: INTJ}, # PRON "PROPN": {POS: PROPN}, - "PPRS": {POS: PROPN}, - "PDMN": {POS: PROPN}, - "PNTR": {POS: PROPN}, + "PPRS": {POS: PROPN}, + "PDMN": {POS: PROPN}, + "PNTR": {POS: PROPN}, # DET "DET": {POS: DET}, - "DDAN": {POS: DET}, - "DDAC": {POS: DET}, - "DDBQ": {POS: DET}, - "DDAQ": {POS: DET}, - "DIAC": {POS: DET}, - "DIBQ": {POS: DET}, - "DIAQ": {POS: DET}, - "DCNM": {POS: DET}, + "DDAN": {POS: DET}, + "DDAC": {POS: DET}, + "DDBQ": {POS: DET}, + "DDAQ": {POS: DET}, + "DIAC": {POS: DET}, + "DIBQ": {POS: DET}, + "DIAQ": {POS: DET}, + "DCNM": {POS: DET}, # NUM "NUM": {POS: NUM}, - "NCNM": {POS: NUM}, - "NLBL": {POS: NUM}, - "DCNM": {POS: NUM}, + "NCNM": {POS: NUM}, + "NLBL": {POS: NUM}, + "DCNM": {POS: NUM}, # AUX "AUX": {POS: AUX}, - "XVBM": {POS: AUX}, - "XVAM": {POS: AUX}, - "XVMM": {POS: AUX}, - "XVBB": {POS: AUX}, - "XVAE": {POS: AUX}, + "XVBM": {POS: AUX}, + "XVAM": {POS: AUX}, + "XVMM": {POS: AUX}, + "XVBB": {POS: AUX}, + "XVAE": {POS: AUX}, # ADP "ADP": {POS: ADP}, - "RPRE": {POS: ADP}, + "RPRE": {POS: ADP}, # CCONJ "CCONJ": {POS: CCONJ}, - "JCRG": {POS: CCONJ}, + "JCRG": {POS: CCONJ}, # SCONJ "SCONJ": {POS: SCONJ}, - "PREL": {POS: SCONJ}, - "JSBR": {POS: SCONJ}, - "JCMP": {POS: SCONJ}, + "PREL": {POS: SCONJ}, + "JSBR": {POS: SCONJ}, + "JCMP": {POS: SCONJ}, # PART - "PART": {POS: PART}, - "FIXN": {POS: PART}, - "FIXV": {POS: PART}, - "EAFF": {POS: 
PART}, - "AITT": {POS: PART}, - "NEG": {POS: PART}, + "PART": {POS: PART}, + "FIXN": {POS: PART}, + "FIXV": {POS: PART}, + "EAFF": {POS: PART}, + "AITT": {POS: PART}, + "NEG": {POS: PART}, # PUNCT "PUNCT": {POS: PUNCT}, - "PUNC": {POS: PUNCT} + "PUNC": {POS: PUNCT}, + "_SP": {POS: SPACE} } diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index 65b46fe08..7845ab4e7 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -4,7 +4,7 @@ from __future__ import unicode_literals from libc.string cimport memset -from .parts_of_speech cimport ADJ, VERB, NOUN, PUNCT +from .parts_of_speech cimport ADJ, VERB, NOUN, PUNCT, SPACE from .attrs cimport POS, IS_SPACE from .parts_of_speech import IDS as POS_IDS from .lexeme cimport Lexeme @@ -36,14 +36,22 @@ cdef class Morphology: def __init__(self, StringStore string_store, tag_map, lemmatizer, exc=None): self.mem = Pool() self.strings = string_store + # Add special space symbol. We prefix it with an underscore so that it + # sorts after every tag name that starts with an upper-case ASCII letter. + space_attrs = tag_map.pop('SP', {POS: SPACE}) + if '_SP' not in tag_map: + self.strings.add('_SP') + tag_map = dict(tag_map) + tag_map['_SP'] = space_attrs + self.tag_names = tuple(sorted(tag_map.keys())) self.tag_map = {} self.lemmatizer = lemmatizer self.n_tags = len(tag_map) - self.tag_names = tuple(sorted(tag_map.keys())) self.reverse_index = {} self.rich_tags = self.mem.alloc(self.n_tags+1, sizeof(RichTagC)) for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())): + self.strings.add(tag_str) self.tag_map[tag_str] = dict(attrs) attrs = _normalize_props(attrs) attrs = intify_attrs(attrs, self.strings, _do_deprecated=True) @@ -93,7 +101,7 @@ cdef class Morphology: # the statistical model fails. 
# Related to Issue #220 if Lexeme.c_check_flag(token.lex, IS_SPACE): - tag_id = self.reverse_index[self.strings.add('SP')] + tag_id = self.reverse_index[self.strings.add('_SP')] rich_tag = self.rich_tags[tag_id] analysis = self._cache.get(tag_id, token.lex.orth) if analysis is NULL: