Rename 'SP' special tag to '_SP'

Renaming the tag with an underscore prefix lets us add it to the tag map
without worrying that we'll change the sequence of tags, which would throw
off the tag-to-ID mapping. For instance, if we inserted an 'SP' tag,
the "VERB" tag would be pushed to a different class ID, and the model's
predicted class IDs would no longer map to the correct tags.
This commit is contained in:
Matthew Honnibal 2017-10-20 14:01:12 +02:00
parent 506cf2eb13
commit 49895fbef6
5 changed files with 54 additions and 45 deletions

View File

@ -62,5 +62,5 @@ TAG_MAP = {
"VVIZU": {POS: VERB, "VerbForm": "inf"}, "VVIZU": {POS: VERB, "VerbForm": "inf"},
"VVPP": {POS: VERB, "Aspect": "perf", "VerbForm": "part"}, "VVPP": {POS: VERB, "Aspect": "perf", "VerbForm": "part"},
"XY": {POS: X}, "XY": {POS: X},
"SP": {POS: SPACE} "_SP": {POS: SPACE}
} }

View File

@ -55,11 +55,11 @@ TAG_MAP = {
"WP": {POS: NOUN, "PronType": "int|rel"}, "WP": {POS: NOUN, "PronType": "int|rel"},
"WP$": {POS: ADJ, "Poss": "yes", "PronType": "int|rel"}, "WP$": {POS: ADJ, "Poss": "yes", "PronType": "int|rel"},
"WRB": {POS: ADV, "PronType": "int|rel"}, "WRB": {POS: ADV, "PronType": "int|rel"},
"SP": {POS: SPACE},
"ADD": {POS: X}, "ADD": {POS: X},
"NFP": {POS: PUNCT}, "NFP": {POS: PUNCT},
"GW": {POS: X}, "GW": {POS: X},
"XX": {POS: X}, "XX": {POS: X},
"BES": {POS: VERB}, "BES": {POS: VERB},
"HVS": {POS: VERB} "HVS": {POS: VERB},
"_SP": {POS: SPACE},
} }

View File

@ -303,5 +303,5 @@ TAG_MAP = {
"VERB__VerbForm=Ger": {"morph": "VerbForm=Ger", "pos": "VERB"}, "VERB__VerbForm=Ger": {"morph": "VerbForm=Ger", "pos": "VERB"},
"VERB__VerbForm=Inf": {"morph": "VerbForm=Inf", "pos": "VERB"}, "VERB__VerbForm=Inf": {"morph": "VerbForm=Inf", "pos": "VERB"},
"X___": {"morph": "_", "pos": "X"}, "X___": {"morph": "_", "pos": "X"},
"SP": {"morph": "_", "pos": "SPACE"}, "_SP": {"morph": "_", "pos": "SPACE"},
} }

View File

@ -19,63 +19,64 @@ TAG_MAP = {
"NPRP": {POS: PRON}, "NPRP": {POS: PRON},
# ADJ # ADJ
"ADJ": {POS: ADJ}, "ADJ": {POS: ADJ},
"NONM": {POS: ADJ}, "NONM": {POS: ADJ},
"VATT": {POS: ADJ}, "VATT": {POS: ADJ},
"DONM": {POS: ADJ}, "DONM": {POS: ADJ},
# ADV # ADV
"ADV": {POS: ADV}, "ADV": {POS: ADV},
"ADVN": {POS: ADV}, "ADVN": {POS: ADV},
"ADVI": {POS: ADV}, "ADVI": {POS: ADV},
"ADVP": {POS: ADV}, "ADVP": {POS: ADV},
"ADVS": {POS: ADV}, "ADVS": {POS: ADV},
# INT # INT
"INT": {POS: INTJ}, "INT": {POS: INTJ},
# PRON # PRON
"PROPN": {POS: PROPN}, "PROPN": {POS: PROPN},
"PPRS": {POS: PROPN}, "PPRS": {POS: PROPN},
"PDMN": {POS: PROPN}, "PDMN": {POS: PROPN},
"PNTR": {POS: PROPN}, "PNTR": {POS: PROPN},
# DET # DET
"DET": {POS: DET}, "DET": {POS: DET},
"DDAN": {POS: DET}, "DDAN": {POS: DET},
"DDAC": {POS: DET}, "DDAC": {POS: DET},
"DDBQ": {POS: DET}, "DDBQ": {POS: DET},
"DDAQ": {POS: DET}, "DDAQ": {POS: DET},
"DIAC": {POS: DET}, "DIAC": {POS: DET},
"DIBQ": {POS: DET}, "DIBQ": {POS: DET},
"DIAQ": {POS: DET}, "DIAQ": {POS: DET},
"DCNM": {POS: DET}, "DCNM": {POS: DET},
# NUM # NUM
"NUM": {POS: NUM}, "NUM": {POS: NUM},
"NCNM": {POS: NUM}, "NCNM": {POS: NUM},
"NLBL": {POS: NUM}, "NLBL": {POS: NUM},
"DCNM": {POS: NUM}, "DCNM": {POS: NUM},
# AUX # AUX
"AUX": {POS: AUX}, "AUX": {POS: AUX},
"XVBM": {POS: AUX}, "XVBM": {POS: AUX},
"XVAM": {POS: AUX}, "XVAM": {POS: AUX},
"XVMM": {POS: AUX}, "XVMM": {POS: AUX},
"XVBB": {POS: AUX}, "XVBB": {POS: AUX},
"XVAE": {POS: AUX}, "XVAE": {POS: AUX},
# ADP # ADP
"ADP": {POS: ADP}, "ADP": {POS: ADP},
"RPRE": {POS: ADP}, "RPRE": {POS: ADP},
# CCONJ # CCONJ
"CCONJ": {POS: CCONJ}, "CCONJ": {POS: CCONJ},
"JCRG": {POS: CCONJ}, "JCRG": {POS: CCONJ},
# SCONJ # SCONJ
"SCONJ": {POS: SCONJ}, "SCONJ": {POS: SCONJ},
"PREL": {POS: SCONJ}, "PREL": {POS: SCONJ},
"JSBR": {POS: SCONJ}, "JSBR": {POS: SCONJ},
"JCMP": {POS: SCONJ}, "JCMP": {POS: SCONJ},
# PART # PART
"PART": {POS: PART}, "PART": {POS: PART},
"FIXN": {POS: PART}, "FIXN": {POS: PART},
"FIXV": {POS: PART}, "FIXV": {POS: PART},
"EAFF": {POS: PART}, "EAFF": {POS: PART},
"AITT": {POS: PART}, "AITT": {POS: PART},
"NEG": {POS: PART}, "NEG": {POS: PART},
# PUNCT # PUNCT
"PUNCT": {POS: PUNCT}, "PUNCT": {POS: PUNCT},
"PUNC": {POS: PUNCT} "PUNC": {POS: PUNCT},
"_SP": {POS: SPACE}
} }

View File

@ -4,7 +4,7 @@ from __future__ import unicode_literals
from libc.string cimport memset from libc.string cimport memset
from .parts_of_speech cimport ADJ, VERB, NOUN, PUNCT from .parts_of_speech cimport ADJ, VERB, NOUN, PUNCT, SPACE
from .attrs cimport POS, IS_SPACE from .attrs cimport POS, IS_SPACE
from .parts_of_speech import IDS as POS_IDS from .parts_of_speech import IDS as POS_IDS
from .lexeme cimport Lexeme from .lexeme cimport Lexeme
@ -36,14 +36,22 @@ cdef class Morphology:
def __init__(self, StringStore string_store, tag_map, lemmatizer, exc=None): def __init__(self, StringStore string_store, tag_map, lemmatizer, exc=None):
self.mem = Pool() self.mem = Pool()
self.strings = string_store self.strings = string_store
# Add special space symbol. We prefix with underscore, to make sure it
# always sorts to the end.
space_attrs = tag_map.pop('SP', {POS: SPACE})
if '_SP' not in tag_map:
self.strings.add('_SP')
tag_map = dict(tag_map)
tag_map['_SP'] = space_attrs
self.tag_names = tuple(sorted(tag_map.keys()))
self.tag_map = {} self.tag_map = {}
self.lemmatizer = lemmatizer self.lemmatizer = lemmatizer
self.n_tags = len(tag_map) self.n_tags = len(tag_map)
self.tag_names = tuple(sorted(tag_map.keys()))
self.reverse_index = {} self.reverse_index = {}
self.rich_tags = <RichTagC*>self.mem.alloc(self.n_tags+1, sizeof(RichTagC)) self.rich_tags = <RichTagC*>self.mem.alloc(self.n_tags+1, sizeof(RichTagC))
for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())): for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
self.strings.add(tag_str)
self.tag_map[tag_str] = dict(attrs) self.tag_map[tag_str] = dict(attrs)
attrs = _normalize_props(attrs) attrs = _normalize_props(attrs)
attrs = intify_attrs(attrs, self.strings, _do_deprecated=True) attrs = intify_attrs(attrs, self.strings, _do_deprecated=True)
@ -93,7 +101,7 @@ cdef class Morphology:
# the statistical model fails. # the statistical model fails.
# Related to Issue #220 # Related to Issue #220
if Lexeme.c_check_flag(token.lex, IS_SPACE): if Lexeme.c_check_flag(token.lex, IS_SPACE):
tag_id = self.reverse_index[self.strings.add('SP')] tag_id = self.reverse_index[self.strings.add('_SP')]
rich_tag = self.rich_tags[tag_id] rich_tag = self.rich_tags[tag_id]
analysis = <MorphAnalysisC*>self._cache.get(tag_id, token.lex.orth) analysis = <MorphAnalysisC*>self._cache.get(tag_id, token.lex.orth)
if analysis is NULL: if analysis is NULL: