mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-27 01:34:30 +03:00
Fix #595: Lemmatization was incorrect for base forms because the morphological analyser wasn't adding morphology properly.
This commit is contained in:
parent
e30348b331
commit
293c79c09a
|
@ -35,6 +35,8 @@ cdef class Morphology:
|
||||||
|
|
||||||
cdef int assign_tag(self, TokenC* token, tag) except -1
|
cdef int assign_tag(self, TokenC* token, tag) except -1
|
||||||
|
|
||||||
|
cdef int _assign_tag_id(self, TokenC* token, int tag_id) except -1
|
||||||
|
|
||||||
cdef int assign_feature(self, uint64_t* morph, feature, value) except -1
|
cdef int assign_feature(self, uint64_t* morph, feature, value) except -1
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -35,15 +35,15 @@ cdef class Morphology:
|
||||||
return (Morphology, (self.strings, self.tag_map, self.lemmatizer), None, None)
|
return (Morphology, (self.strings, self.tag_map, self.lemmatizer), None, None)
|
||||||
|
|
||||||
cdef int assign_tag(self, TokenC* token, tag) except -1:
|
cdef int assign_tag(self, TokenC* token, tag) except -1:
|
||||||
cdef int tag_id
|
|
||||||
if isinstance(tag, basestring):
|
if isinstance(tag, basestring):
|
||||||
tag_id = self.reverse_index[self.strings[tag]]
|
tag_id = self.reverse_index[self.strings[tag]]
|
||||||
tag_str = tag
|
|
||||||
else:
|
else:
|
||||||
tag_id = tag
|
tag_id = self.reverse_index[tag]
|
||||||
tag_str = self.strings[tag]
|
self.assign_tag_id(token, tag_id)
|
||||||
|
|
||||||
|
cdef int _assign_tag_id(self, TokenC* token, int tag_id) except -1:
|
||||||
if tag_id >= self.n_tags:
|
if tag_id >= self.n_tags:
|
||||||
raise ValueError("Unknown tag: %s" % tag)
|
raise ValueError("Unknown tag ID: %s" % tag_id)
|
||||||
# TODO: It's pretty arbitrary to put this logic here. I guess the justification
|
# TODO: It's pretty arbitrary to put this logic here. I guess the justification
|
||||||
# is that this is where the specific word and the tag interact. Still,
|
# is that this is where the specific word and the tag interact. Still,
|
||||||
# we should have a better way to enforce this rule, or figure out why
|
# we should have a better way to enforce this rule, or figure out why
|
||||||
|
@ -55,6 +55,7 @@ cdef class Morphology:
|
||||||
if analysis is NULL:
|
if analysis is NULL:
|
||||||
analysis = <MorphAnalysisC*>self.mem.alloc(1, sizeof(MorphAnalysisC))
|
analysis = <MorphAnalysisC*>self.mem.alloc(1, sizeof(MorphAnalysisC))
|
||||||
analysis.tag = self.rich_tags[tag_id]
|
analysis.tag = self.rich_tags[tag_id]
|
||||||
|
tag_str = self.strings[self.rich_tags[tag_id].name]
|
||||||
analysis.lemma = self.lemmatize(analysis.tag.pos, token.lex.orth,
|
analysis.lemma = self.lemmatize(analysis.tag.pos, token.lex.orth,
|
||||||
**self.tag_map.get(tag_str, {}))
|
**self.tag_map.get(tag_str, {}))
|
||||||
self._cache.set(tag_id, token.lex.orth, analysis)
|
self._cache.set(tag_id, token.lex.orth, analysis)
|
||||||
|
|
|
@ -174,11 +174,7 @@ cdef class Token:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.c.tag
|
return self.c.tag
|
||||||
def __set__(self, int tag):
|
def __set__(self, int tag):
|
||||||
# TODO: The behaviour here --- that it fails when we don't have the
|
self.vocab.morphology.assign_tag(self.c, tag)
|
||||||
# tag in the 'reverse index' --- really sucks. But we can't fix it
|
|
||||||
# here if we don't fix it elsewhere...
|
|
||||||
self.vocab.morphology.assign_tag(self.c,
|
|
||||||
self.vocab.morphology.reverse_index[tag])
|
|
||||||
|
|
||||||
property dep:
|
property dep:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
|
|
Loading…
Reference in New Issue
Block a user