* Remove happax stuff

2025-08-09 22:54:53 +03:00 · 2014-08-02 22:11:28 +01:00 · 2014-08-02 22:11:28 +01:00 · f11c8e22eb
commit f11c8e22eb
parent 6050209f65
2 changed files with 3 additions and 25 deletions
--- a/spacy/spacy.pxd
+++ b/spacy/spacy.pxd
@@ -26,7 +26,6 @@ from spacy.lexeme cimport Orthography

 cdef class Language:
    cdef object name
-    cdef FixedTable happax
    cdef Vocab* vocab
    cdef Vocab* distri
    cdef Vocab* ortho
@@ -42,5 +41,3 @@ cdef class Language:
    cdef Lexeme* init_lexeme(self, unicode string, StringHash hashed,
                             int split, size_t length)
    cdef Orthography* init_orth(self, StringHash hashed, unicode lex)
-
-    cdef int _happax_to_vocab(self, StringHash hashed, Lexeme_addr addr)
--- a/spacy/spacy.pyx
+++ b/spacy/spacy.pyx
@@ -54,14 +54,10 @@ def set_orth_flags(lex, length):
    return 0


-DEF MAX_HAPPAX = 1048576
-
-
 cdef class Language:
    def __cinit__(self, name):
        self.name = name
        self.bacov = {}
-        self.happax = FixedTable(MAX_HAPPAX)
        self.vocab = new Vocab()
        self.ortho = new Vocab()
        self.distri = new Vocab()
@@ -84,7 +80,6 @@ cdef class Language:
                length = len(token_string)
                hashed = self.hash_string(token_string, length)
                word.tail = self._add(hashed, lex, 0, len(lex))
-                self._happax_to_vocab(hashed, <Lexeme_addr>word.tail)
                word = word.tail

    def load_clusters(self):
@@ -126,27 +121,14 @@ cdef class Language:
        # First, check words seen 2+ times
        cdef Lexeme* word_ptr = <Lexeme*>self.vocab[0][hashed]
        if word_ptr == NULL:
-            # Now check words seen exactly once
-            word_ptr = <Lexeme*>self.happax.get(hashed)
-            if word_ptr == NULL:
-                start = self.find_split(string, length) if start == -1 else start
-                word_ptr = self._add(hashed, string, start, length)
-            else:
-                # Second time word seen, move to vocab
-                self._happax_to_vocab(hashed, <Lexeme_addr>word_ptr)
+            start = self.find_split(string, length) if start == -1 else start
+            word_ptr = self._add(hashed, string, start, length)
        return <Lexeme_addr>word_ptr

-    cdef int _happax_to_vocab(self, StringHash hashed, Lexeme_addr word_ptr):
-        self.vocab[0][hashed] = word_ptr
-        self.happax.erase(hashed)
-
    cdef Lexeme* _add(self, StringHash hashed, unicode string, int split, size_t length):
        cdef size_t i
        word = self.init_lexeme(string, hashed, split, length)
-        if self.happax.keys[hashed % self.happax.size] != 0:
-            self._happax_to_vocab(self.happax.keys[hashed % self.happax.size],
-                                  self.happax.values[hashed % self.happax.size])
-        self.happax.insert(hashed, <size_t>word)
+        self.vocab[0][hashed] = <size_t>word
        self.bacov[hashed] = string.encode('utf8')
        return word   

@@ -211,7 +193,6 @@ cdef class Language:
        # Now recurse, and deal with the tail
        if tail_string:
            word.tail = <Lexeme*>self.lookup(-1, tail_string, len(tail_string))
-            self._happax_to_vocab(word.tail.sic, <Lexeme_addr>word.tail)
        return word

    cdef Orthography* init_orth(self, StringHash hashed, unicode lex):