mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-25 00:34:20 +03:00
Rename 'SP' special tag to '_SP'
Renaming the tag with an underscore lets us add it to the tag map without worrying that we'll change the sequence of tags, which throws off the tag-to-ID mapping. For instance, if we inserted a 'SP' tag, the "VERB" tag is pushed to a different class ID, and the model is all messed up.
This commit is contained in:
parent
506cf2eb13
commit
49895fbef6
|
@ -62,5 +62,5 @@ TAG_MAP = {
|
|||
"VVIZU": {POS: VERB, "VerbForm": "inf"},
|
||||
"VVPP": {POS: VERB, "Aspect": "perf", "VerbForm": "part"},
|
||||
"XY": {POS: X},
|
||||
"SP": {POS: SPACE}
|
||||
"_SP": {POS: SPACE}
|
||||
}
|
||||
|
|
|
@ -55,11 +55,11 @@ TAG_MAP = {
|
|||
"WP": {POS: NOUN, "PronType": "int|rel"},
|
||||
"WP$": {POS: ADJ, "Poss": "yes", "PronType": "int|rel"},
|
||||
"WRB": {POS: ADV, "PronType": "int|rel"},
|
||||
"SP": {POS: SPACE},
|
||||
"ADD": {POS: X},
|
||||
"NFP": {POS: PUNCT},
|
||||
"GW": {POS: X},
|
||||
"XX": {POS: X},
|
||||
"BES": {POS: VERB},
|
||||
"HVS": {POS: VERB}
|
||||
"HVS": {POS: VERB},
|
||||
"_SP": {POS: SPACE},
|
||||
}
|
||||
|
|
|
@ -303,5 +303,5 @@ TAG_MAP = {
|
|||
"VERB__VerbForm=Ger": {"morph": "VerbForm=Ger", "pos": "VERB"},
|
||||
"VERB__VerbForm=Inf": {"morph": "VerbForm=Inf", "pos": "VERB"},
|
||||
"X___": {"morph": "_", "pos": "X"},
|
||||
"SP": {"morph": "_", "pos": "SPACE"},
|
||||
"_SP": {"morph": "_", "pos": "SPACE"},
|
||||
}
|
||||
|
|
|
@ -19,63 +19,64 @@ TAG_MAP = {
|
|||
"NPRP": {POS: PRON},
|
||||
# ADJ
|
||||
"ADJ": {POS: ADJ},
|
||||
"NONM": {POS: ADJ},
|
||||
"VATT": {POS: ADJ},
|
||||
"DONM": {POS: ADJ},
|
||||
"NONM": {POS: ADJ},
|
||||
"VATT": {POS: ADJ},
|
||||
"DONM": {POS: ADJ},
|
||||
# ADV
|
||||
"ADV": {POS: ADV},
|
||||
"ADVN": {POS: ADV},
|
||||
"ADVI": {POS: ADV},
|
||||
"ADVP": {POS: ADV},
|
||||
"ADVS": {POS: ADV},
|
||||
"ADVN": {POS: ADV},
|
||||
"ADVI": {POS: ADV},
|
||||
"ADVP": {POS: ADV},
|
||||
"ADVS": {POS: ADV},
|
||||
# INT
|
||||
"INT": {POS: INTJ},
|
||||
# PRON
|
||||
"PROPN": {POS: PROPN},
|
||||
"PPRS": {POS: PROPN},
|
||||
"PDMN": {POS: PROPN},
|
||||
"PNTR": {POS: PROPN},
|
||||
"PPRS": {POS: PROPN},
|
||||
"PDMN": {POS: PROPN},
|
||||
"PNTR": {POS: PROPN},
|
||||
# DET
|
||||
"DET": {POS: DET},
|
||||
"DDAN": {POS: DET},
|
||||
"DDAC": {POS: DET},
|
||||
"DDBQ": {POS: DET},
|
||||
"DDAQ": {POS: DET},
|
||||
"DIAC": {POS: DET},
|
||||
"DIBQ": {POS: DET},
|
||||
"DIAQ": {POS: DET},
|
||||
"DCNM": {POS: DET},
|
||||
"DDAN": {POS: DET},
|
||||
"DDAC": {POS: DET},
|
||||
"DDBQ": {POS: DET},
|
||||
"DDAQ": {POS: DET},
|
||||
"DIAC": {POS: DET},
|
||||
"DIBQ": {POS: DET},
|
||||
"DIAQ": {POS: DET},
|
||||
"DCNM": {POS: DET},
|
||||
# NUM
|
||||
"NUM": {POS: NUM},
|
||||
"NCNM": {POS: NUM},
|
||||
"NLBL": {POS: NUM},
|
||||
"DCNM": {POS: NUM},
|
||||
"NCNM": {POS: NUM},
|
||||
"NLBL": {POS: NUM},
|
||||
"DCNM": {POS: NUM},
|
||||
# AUX
|
||||
"AUX": {POS: AUX},
|
||||
"XVBM": {POS: AUX},
|
||||
"XVAM": {POS: AUX},
|
||||
"XVMM": {POS: AUX},
|
||||
"XVBB": {POS: AUX},
|
||||
"XVAE": {POS: AUX},
|
||||
"XVBM": {POS: AUX},
|
||||
"XVAM": {POS: AUX},
|
||||
"XVMM": {POS: AUX},
|
||||
"XVBB": {POS: AUX},
|
||||
"XVAE": {POS: AUX},
|
||||
# ADP
|
||||
"ADP": {POS: ADP},
|
||||
"RPRE": {POS: ADP},
|
||||
"RPRE": {POS: ADP},
|
||||
# CCONJ
|
||||
"CCONJ": {POS: CCONJ},
|
||||
"JCRG": {POS: CCONJ},
|
||||
"JCRG": {POS: CCONJ},
|
||||
# SCONJ
|
||||
"SCONJ": {POS: SCONJ},
|
||||
"PREL": {POS: SCONJ},
|
||||
"JSBR": {POS: SCONJ},
|
||||
"JCMP": {POS: SCONJ},
|
||||
"PREL": {POS: SCONJ},
|
||||
"JSBR": {POS: SCONJ},
|
||||
"JCMP": {POS: SCONJ},
|
||||
# PART
|
||||
"PART": {POS: PART},
|
||||
"FIXN": {POS: PART},
|
||||
"FIXV": {POS: PART},
|
||||
"EAFF": {POS: PART},
|
||||
"AITT": {POS: PART},
|
||||
"NEG": {POS: PART},
|
||||
"PART": {POS: PART},
|
||||
"FIXN": {POS: PART},
|
||||
"FIXV": {POS: PART},
|
||||
"EAFF": {POS: PART},
|
||||
"AITT": {POS: PART},
|
||||
"NEG": {POS: PART},
|
||||
# PUNCT
|
||||
"PUNCT": {POS: PUNCT},
|
||||
"PUNC": {POS: PUNCT}
|
||||
"PUNC": {POS: PUNCT},
|
||||
"_SP": {POS: SPACE}
|
||||
}
|
||||
|
|
|
@ -4,7 +4,7 @@ from __future__ import unicode_literals
|
|||
|
||||
from libc.string cimport memset
|
||||
|
||||
from .parts_of_speech cimport ADJ, VERB, NOUN, PUNCT
|
||||
from .parts_of_speech cimport ADJ, VERB, NOUN, PUNCT, SPACE
|
||||
from .attrs cimport POS, IS_SPACE
|
||||
from .parts_of_speech import IDS as POS_IDS
|
||||
from .lexeme cimport Lexeme
|
||||
|
@ -36,14 +36,22 @@ cdef class Morphology:
|
|||
def __init__(self, StringStore string_store, tag_map, lemmatizer, exc=None):
|
||||
self.mem = Pool()
|
||||
self.strings = string_store
|
||||
# Add special space symbol. We prefix with underscore, to make sure it
|
||||
# always sorts to the end.
|
||||
space_attrs = tag_map.pop('SP', {POS: SPACE})
|
||||
if '_SP' not in tag_map:
|
||||
self.strings.add('_SP')
|
||||
tag_map = dict(tag_map)
|
||||
tag_map['_SP'] = space_attrs
|
||||
self.tag_names = tuple(sorted(tag_map.keys()))
|
||||
self.tag_map = {}
|
||||
self.lemmatizer = lemmatizer
|
||||
self.n_tags = len(tag_map)
|
||||
self.tag_names = tuple(sorted(tag_map.keys()))
|
||||
self.reverse_index = {}
|
||||
|
||||
self.rich_tags = <RichTagC*>self.mem.alloc(self.n_tags+1, sizeof(RichTagC))
|
||||
for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
|
||||
self.strings.add(tag_str)
|
||||
self.tag_map[tag_str] = dict(attrs)
|
||||
attrs = _normalize_props(attrs)
|
||||
attrs = intify_attrs(attrs, self.strings, _do_deprecated=True)
|
||||
|
@ -93,7 +101,7 @@ cdef class Morphology:
|
|||
# the statistical model fails.
|
||||
# Related to Issue #220
|
||||
if Lexeme.c_check_flag(token.lex, IS_SPACE):
|
||||
tag_id = self.reverse_index[self.strings.add('SP')]
|
||||
tag_id = self.reverse_index[self.strings.add('_SP')]
|
||||
rich_tag = self.rich_tags[tag_id]
|
||||
analysis = <MorphAnalysisC*>self._cache.get(tag_id, token.lex.orth)
|
||||
if analysis is NULL:
|
||||
|
|
Loading…
Reference in New Issue
Block a user