mirror of
https://github.com/explosion/spaCy.git
synced 2025-10-03 02:17:00 +03:00
Ratty implementation of morphology features
This commit is contained in:
parent
9c32388235
commit
9db60acd7c
|
@ -107,6 +107,7 @@ cdef class Morphology:
|
||||||
# justification is that this is where the specific word and the tag
|
# justification is that this is where the specific word and the tag
|
||||||
# interact. Still, we should have a better way to enforce this rule, or
|
# interact. Still, we should have a better way to enforce this rule, or
|
||||||
# figure out why the statistical model fails. Related to Issue #220
|
# figure out why the statistical model fails. Related to Issue #220
|
||||||
|
previous_features = self.get_features(token.morph)
|
||||||
if Lexeme.c_check_flag(token.lex, IS_SPACE):
|
if Lexeme.c_check_flag(token.lex, IS_SPACE):
|
||||||
tag_id = self.reverse_index[self.strings.add('_SP')]
|
tag_id = self.reverse_index[self.strings.add('_SP')]
|
||||||
rich_tag = self.rich_tags[tag_id]
|
rich_tag = self.rich_tags[tag_id]
|
||||||
|
@ -122,6 +123,8 @@ cdef class Morphology:
|
||||||
token.pos = analysis.tag.pos
|
token.pos = analysis.tag.pos
|
||||||
token.tag = analysis.tag.name
|
token.tag = analysis.tag.name
|
||||||
token.morph = analysis.tag.morph
|
token.morph = analysis.tag.morph
|
||||||
|
for feature in previous_features:
|
||||||
|
self.set_feature(&token.morph, feature, True)
|
||||||
|
|
||||||
cdef int assign_feature(self, uint64_t* morph, univ_morph_t flag_id, bint value) except -1:
|
cdef int assign_feature(self, uint64_t* morph, univ_morph_t flag_id, bint value) except -1:
|
||||||
# Deprecated
|
# Deprecated
|
||||||
|
@ -146,7 +149,10 @@ cdef class Morphology:
|
||||||
self._morph2features[morph[0]] = new_features
|
self._morph2features[morph[0]] = new_features
|
||||||
|
|
||||||
def get_features(self, uint64_t morph):
|
def get_features(self, uint64_t morph):
|
||||||
return self._morph2features.get(morph, frozenset())
|
if morph in self._morph2features:
|
||||||
|
return self._morph2features[morph]
|
||||||
|
else:
|
||||||
|
return frozenset()
|
||||||
|
|
||||||
def add_special_case(self, unicode tag_str, unicode orth_str, attrs,
|
def add_special_case(self, unicode tag_str, unicode orth_str, attrs,
|
||||||
force=False):
|
force=False):
|
||||||
|
|
|
@ -484,9 +484,11 @@ class Tagger(Pipe):
|
||||||
new_tag_map[tag] = {POS: X}
|
new_tag_map[tag] = {POS: X}
|
||||||
cdef Vocab vocab = self.vocab
|
cdef Vocab vocab = self.vocab
|
||||||
if new_tag_map:
|
if new_tag_map:
|
||||||
|
morph_feats = self.vocab.morphology._morph2features
|
||||||
vocab.morphology = Morphology(vocab.strings, new_tag_map,
|
vocab.morphology = Morphology(vocab.strings, new_tag_map,
|
||||||
vocab.morphology.lemmatizer,
|
vocab.morphology.lemmatizer,
|
||||||
exc=vocab.morphology.exc)
|
exc=vocab.morphology.exc)
|
||||||
|
vocab.morphology._morph2features = morph_feats
|
||||||
if self.model is True:
|
if self.model is True:
|
||||||
self.cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1]
|
self.cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1]
|
||||||
self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg)
|
self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg)
|
||||||
|
@ -519,10 +521,12 @@ class Tagger(Pipe):
|
||||||
if values is None:
|
if values is None:
|
||||||
values = {POS: "X"}
|
values = {POS: "X"}
|
||||||
tag_map[label] = values
|
tag_map[label] = values
|
||||||
|
# Save the feature table from the current Morphology *instance* (not the
# class attribute) so it can be re-attached after the instance is rebuilt.
morph_feats = self.vocab.morphology._morph2features
|
||||||
self.vocab.morphology = Morphology(
|
self.vocab.morphology = Morphology(
|
||||||
self.vocab.strings, tag_map=tag_map,
|
self.vocab.strings, tag_map=tag_map,
|
||||||
lemmatizer=self.vocab.morphology.lemmatizer,
|
lemmatizer=self.vocab.morphology.lemmatizer,
|
||||||
exc=self.vocab.morphology.exc)
|
exc=self.vocab.morphology.exc)
|
||||||
|
self.vocab.morphology._morph2features = morph_feats
|
||||||
return 1
|
return 1
|
||||||
|
|
||||||
def use_params(self, params):
|
def use_params(self, params):
|
||||||
|
@ -554,10 +558,12 @@ class Tagger(Pipe):
|
||||||
|
|
||||||
def load_tag_map(b):
|
def load_tag_map(b):
|
||||||
tag_map = msgpack.loads(b, encoding='utf8')
|
tag_map = msgpack.loads(b, encoding='utf8')
|
||||||
|
morph_feats = self.vocab.morphology._morph2features
|
||||||
self.vocab.morphology = Morphology(
|
self.vocab.morphology = Morphology(
|
||||||
self.vocab.strings, tag_map=tag_map,
|
self.vocab.strings, tag_map=tag_map,
|
||||||
lemmatizer=self.vocab.morphology.lemmatizer,
|
lemmatizer=self.vocab.morphology.lemmatizer,
|
||||||
exc=self.vocab.morphology.exc)
|
exc=self.vocab.morphology.exc)
|
||||||
|
self.vocab.morphology._morph2features = morph_feats
|
||||||
|
|
||||||
deserialize = OrderedDict((
|
deserialize = OrderedDict((
|
||||||
('vocab', lambda b: self.vocab.from_bytes(b)),
|
('vocab', lambda b: self.vocab.from_bytes(b)),
|
||||||
|
@ -590,10 +596,12 @@ class Tagger(Pipe):
|
||||||
def load_tag_map(p):
|
def load_tag_map(p):
|
||||||
with p.open('rb') as file_:
|
with p.open('rb') as file_:
|
||||||
tag_map = msgpack.loads(file_.read(), encoding='utf8')
|
tag_map = msgpack.loads(file_.read(), encoding='utf8')
|
||||||
|
morph_feats = self.vocab.morphology._morph2features
|
||||||
self.vocab.morphology = Morphology(
|
self.vocab.morphology = Morphology(
|
||||||
self.vocab.strings, tag_map=tag_map,
|
self.vocab.strings, tag_map=tag_map,
|
||||||
lemmatizer=self.vocab.morphology.lemmatizer,
|
lemmatizer=self.vocab.morphology.lemmatizer,
|
||||||
exc=self.vocab.morphology.exc)
|
exc=self.vocab.morphology.exc)
|
||||||
|
self.vocab.morphology._morph2features = morph_feats
|
||||||
|
|
||||||
deserialize = OrderedDict((
|
deserialize = OrderedDict((
|
||||||
('cfg', lambda p: self.cfg.update(_load_cfg(p))),
|
('cfg', lambda p: self.cfg.update(_load_cfg(p))),
|
||||||
|
|
Loading…
Reference in New Issue
Block a user