From 293c79c09a771bb50cfa05a1fdeab2146cdca4b3 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 4 Nov 2016 00:29:07 +0100 Subject: [PATCH] Fix #595: Lemmatization was incorrect for base forms, because morphological analyser wasn't adding morphology properly. --- spacy/morphology.pxd | 2 ++ spacy/morphology.pyx | 11 ++++++----- spacy/tokens/token.pyx | 6 +----- 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd index 847626158..ad9d61eab 100644 --- a/spacy/morphology.pxd +++ b/spacy/morphology.pxd @@ -35,6 +35,8 @@ cdef class Morphology: cdef int assign_tag(self, TokenC* token, tag) except -1 + cdef int _assign_tag_id(self, TokenC* token, int tag_id) except -1 + cdef int assign_feature(self, uint64_t* morph, feature, value) except -1 diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index e55fcc155..b1302454f 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -35,15 +35,15 @@ cdef class Morphology: return (Morphology, (self.strings, self.tag_map, self.lemmatizer), None, None) cdef int assign_tag(self, TokenC* token, tag) except -1: - cdef int tag_id if isinstance(tag, basestring): tag_id = self.reverse_index[self.strings[tag]] - tag_str = tag else: - tag_id = tag - tag_str = self.strings[tag] + tag_id = self.reverse_index[tag] + self._assign_tag_id(token, tag_id) + + cdef int _assign_tag_id(self, TokenC* token, int tag_id) except -1: if tag_id >= self.n_tags: - raise ValueError("Unknown tag: %s" % tag) + raise ValueError("Unknown tag ID: %s" % tag_id) # TODO: It's pretty arbitrary to put this logic here. I guess the justification # is that this is where the specific word and the tag interact.
Still, # we should have a better way to enforce this rule, or figure out why @@ -55,6 +55,7 @@ cdef class Morphology: if analysis is NULL: analysis = self.mem.alloc(1, sizeof(MorphAnalysisC)) analysis.tag = self.rich_tags[tag_id] + tag_str = self.strings[self.rich_tags[tag_id].name] analysis.lemma = self.lemmatize(analysis.tag.pos, token.lex.orth, **self.tag_map.get(tag_str, {})) self._cache.set(tag_id, token.lex.orth, analysis) diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 91ddc13cb..26cdae786 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -174,11 +174,7 @@ cdef class Token: def __get__(self): return self.c.tag def __set__(self, int tag): - # TODO: The behaviour here --- that it fails when we don't have the - # tag in the 'reverse index' --- really sucks. But we can't fix it - # here if we don't fix it elsewhere... - self.vocab.morphology.assign_tag(self.c, - self.vocab.morphology.reverse_index[tag]) + self.vocab.morphology.assign_tag(self.c, tag) property dep: def __get__(self):