diff --git a/spacy/spacy.pxd b/spacy/spacy.pxd index befcf9a2f..6501a8a2b 100644 --- a/spacy/spacy.pxd +++ b/spacy/spacy.pxd @@ -42,4 +42,6 @@ cdef class Language: int split, size_t length) cdef Orthography* init_orth(self, StringHash hashed, unicode lex) + cdef int _happax_to_vocab(self, StringHash hashed, Lexeme_addr addr) + diff --git a/spacy/spacy.pyx b/spacy/spacy.pyx index 68cb23aba..07800faf8 100644 --- a/spacy/spacy.pyx +++ b/spacy/spacy.pyx @@ -85,6 +85,7 @@ cdef class Language: length = len(token_string) hashed = self.hash_string(token_string, length) word.tail = self._add(hashed, lex, 0, len(lex)) + self._happax_to_vocab(hashed, word.tail) word = word.tail def load_clusters(self): @@ -133,19 +134,19 @@ cdef class Language: word_ptr = self._add(hashed, string, start, length) else: # Second time word seen, move to vocab - self.vocab[0][hashed] = word_ptr - self.happax.erase(hashed) + self._happax_to_vocab(hashed, word_ptr) return word_ptr + cdef int _happax_to_vocab(self, StringHash hashed, Lexeme_addr word_ptr): + self.vocab[0][hashed] = word_ptr + self.happax.erase(hashed) + cdef Lexeme* _add(self, StringHash hashed, unicode string, int split, size_t length): cdef size_t i word = self.init_lexeme(string, hashed, split, length) cdef Lexeme* clobbered = self.happax.insert(hashed, word) if clobbered != NULL: - # Can't do this --- we might be pointing to the Lexeme in .tail. - # Fix that to reduce memory, probably. - #free(clobbered) - pass + free(clobbered) self.bacov[hashed] = string return word @@ -210,6 +211,7 @@ cdef class Language: # Now recurse, and deal with the tail if tail_string: word.tail = self.lookup(-1, tail_string, len(tail_string)) + self._happax_to_vocab(word.tail.sic, word.tail) return word cdef Orthography* init_orth(self, StringHash hashed, unicode lex):