diff --git a/setup.py b/setup.py
index eadfade84..50a8dd271 100644
--- a/setup.py
+++ b/setup.py
@@ -48,6 +48,7 @@ exts = [
     Extension("spacy.en_ptb", ["spacy/en_ptb.pyx"], language="c++", include_dirs=includes),
     Extension("spacy.lexeme", ["spacy/lexeme.pyx"], language="c++", include_dirs=includes),
     Extension("spacy.spacy", ["spacy/spacy.pyx"], language="c++", include_dirs=includes),
+    Extension("spacy._hashing", ["spacy/_hashing.pyx"], language="c++", include_dirs=includes),
     Extension("spacy.tokens", ["spacy/tokens.pyx"], language="c++", include_dirs=includes),
     Extension("spacy.string_tools", ["spacy/string_tools.pyx"], language="c++", include_dirs=includes),
diff --git a/spacy/__init__.py b/spacy/__init__.py
index d2b763c42..9f7c7932c 100644
--- a/spacy/__init__.py
+++ b/spacy/__init__.py
@@ -1,5 +1,6 @@
 from .lexeme import lex_of
 from .lexeme import sic_of
+from .lexeme import length_of
 
 from .tokens import Tokens
 
@@ -10,28 +11,6 @@
 LEX = 1
 NORM = 2
 SHAPE = 3
 LAST3 = 4
+LENGTH = 5
-__all__ = [Tokens, lex_of, sic_of, SIC, LEX, NORM, SHAPE, LAST3]
-
-
-"""
-from .tokens import ids_from_string
-from .tokens import group_by
-
-from .lex import sic_of
-from .lex import lex_of
-from .lex import normed_of
-from .lex import first_of
-from .lex import last_three_of
-
-from .lex import cluster_of
-from .lex import prob_of
-
-from .lex import is_oft_upper
-from .lex import is_oft_title
-
-from .lex import can_noun
-from .lex import can_verb
-from .lex import can_adj
-from .lex import can_adv
-"""
+__all__ = [Tokens, lex_of, sic_of, length_of, SIC, LEX, NORM, SHAPE, LAST3, LENGTH]
diff --git a/spacy/_hashing.pyx b/spacy/_hashing.pyx
index 72a324673..99c8e7406 100644
--- a/spacy/_hashing.pyx
+++ b/spacy/_hashing.pyx
@@ -51,5 +51,3 @@ cdef class FixedTable:
 @cython.cdivision
 cdef inline size_t _find(uint64_t key, size_t size) nogil:
     return key % size
-
-
diff --git a/spacy/spacy.pxd b/spacy/spacy.pxd
index fdb43df74..f5316a618 100644
--- a/spacy/spacy.pxd
+++ b/spacy/spacy.pxd
@@ -2,6 +2,7 @@ from libcpp.vector cimport vector
 from libc.stdint cimport uint64_t
 
 from sparsehash.dense_hash_map cimport dense_hash_map
+from _hashing cimport FixedTable
 
 # Circular import problems here
 ctypedef size_t Lexeme_addr
@@ -24,6 +25,7 @@ from spacy.lexeme cimport Orthography
 
 cdef class Language:
     cdef object name
+    cdef FixedTable happax
     cdef Vocab* vocab
     cdef Vocab* distri
     cdef Vocab* ortho
@@ -39,3 +41,5 @@ cdef class Language:
     cdef Lexeme* init_lexeme(self, unicode string, StringHash hashed,
                              int split, size_t length)
     cdef Orthography* init_orth(self, StringHash hashed, unicode lex)
+
+    cdef int _happax_to_vocab(self, StringHash hashed, Lexeme_addr addr)
diff --git a/spacy/spacy.pyx b/spacy/spacy.pyx
index d896b922b..1e31ecdb2 100644
--- a/spacy/spacy.pyx
+++ b/spacy/spacy.pyx
@@ -50,15 +50,18 @@ def get_word_shape(lex, length):
     return shape
 
 
-
 def set_orth_flags(lex, length):
     return 0
 
 
+DEF MAX_HAPPAX = 1048576
+
+
 cdef class Language:
     def __cinit__(self, name):
         self.name = name
         self.bacov = {}
+        self.happax = FixedTable(MAX_HAPPAX)
         self.vocab = new Vocab()
         self.ortho = new Vocab()
         self.distri = new Vocab()
@@ -81,6 +84,7 @@ cdef class Language:
             length = len(token_string)
             hashed = self.hash_string(token_string, length)
             word.tail = self._add(hashed, lex, 0, len(lex))
+            self._happax_to_vocab(hashed, <Lexeme_addr>word.tail)
             word = word.tail
 
     def load_clusters(self):
@@ -122,14 +126,27 @@ cdef class Language:
         # First, check words seen 2+ times
         cdef Lexeme* word_ptr = <Lexeme*>self.vocab[0][hashed]
         if word_ptr == NULL:
-            start = self.find_split(string, length) if start == -1 else start
-            word_ptr = self._add(hashed, string, start, length)
+            # Now check words seen exactly once
+            word_ptr = <Lexeme*>self.happax.get(hashed)
+            if word_ptr == NULL:
+                start = self.find_split(string, length) if start == -1 else start
+                word_ptr = self._add(hashed, string, start, length)
+            else:
+                # Second time word seen, move to vocab
+                self._happax_to_vocab(hashed, <Lexeme_addr>word_ptr)
         return <Lexeme_addr>word_ptr
+
+    cdef int _happax_to_vocab(self, StringHash hashed, Lexeme_addr word_ptr):
+        self.vocab[0][hashed] = word_ptr
+        self.happax.erase(hashed)
 
     cdef Lexeme* _add(self, StringHash hashed, unicode string, int split, size_t length):
         cdef size_t i
         word = self.init_lexeme(string, hashed, split, length)
-        self.vocab[0][hashed] = <size_t>word
+        if self.happax.keys[hashed % self.happax.size] != 0:
+            self._happax_to_vocab(self.happax.keys[hashed % self.happax.size],
+                                  self.happax.values[hashed % self.happax.size])
+        self.happax.insert(hashed, <size_t>word)
         self.bacov[hashed] = string
         return word
@@ -194,6 +211,7 @@ cdef class Language:
         # Now recurse, and deal with the tail
        if tail_string:
            word.tail = <Lexeme*>self.lookup(-1, tail_string, len(tail_string))
+            self._happax_to_vocab(word.tail.sic, <Lexeme_addr>word.tail)
         return word
 
     cdef Orthography* init_orth(self, StringHash hashed, unicode lex):
diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx
index 3e26b1cea..1b0d42981 100644
--- a/spacy/tokens.pyx
+++ b/spacy/tokens.pyx
@@ -3,7 +3,7 @@ from cython.operator cimport preincrement as inc
 
 from spacy.lexeme cimport Lexeme
-from spacy.lexeme cimport attr_of, norm_of, shape_of
+from spacy.lexeme cimport attr_of, lex_of, norm_of, shape_of
 from spacy.spacy cimport StringHash
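
Note on the scheme this patch implements: newly seen words land in `happax`, a fixed-size, direct-mapped table of hapax legomena (words seen exactly once; the patch spells it "happax"). An entry is promoted into the permanent `vocab` either on its second sighting or when a colliding hash claims its slot, so one-off tokens never bloat the dense hash map and no lexeme is ever dropped. The sketch below is a minimal pure-Python rendering of that promotion logic, assuming `FixedTable`'s direct-mapped behavior (`key % size`, key 0 reserved for "empty") from spacy/_hashing.pyx; the dict-backed `vocab` and the `make_lexeme` callback are illustrative stand-ins, not spaCy's API.

```python
MAX_HAPPAX = 1048576  # table size used by the patch (DEF MAX_HAPPAX)


class FixedTable:
    """Direct-mapped table: each key owns exactly one slot; key 0 means empty."""
    def __init__(self, size):
        self.size = size
        self.keys = [0] * size
        self.values = [None] * size

    def get(self, key):
        i = key % self.size
        return self.values[i] if self.keys[i] == key else None

    def insert(self, key, value):
        i = key % self.size
        self.keys[i] = key
        self.values[i] = value

    def erase(self, key):
        i = key % self.size
        if self.keys[i] == key:
            self.keys[i] = 0
            self.values[i] = None


class Language:
    def __init__(self):
        self.vocab = {}                       # words seen 2+ times
        self.happax = FixedTable(MAX_HAPPAX)  # words seen exactly once

    def lookup(self, hashed, make_lexeme):
        # First, check words seen 2+ times
        word = self.vocab.get(hashed)
        if word is None:
            # Now check words seen exactly once
            word = self.happax.get(hashed)
            if word is None:
                word = self._add(hashed, make_lexeme)
            else:
                # Second time word seen, move to vocab
                self._happax_to_vocab(hashed, word)
        return word

    def _happax_to_vocab(self, hashed, word):
        self.vocab[hashed] = word
        self.happax.erase(hashed)

    def _add(self, hashed, make_lexeme):
        # If another hash already holds this slot, promote the resident
        # entry instead of dropping it, then claim the slot.
        slot = hashed % self.happax.size
        if self.happax.keys[slot] != 0:
            self._happax_to_vocab(self.happax.keys[slot],
                                  self.happax.values[slot])
        word = make_lexeme()
        self.happax.insert(hashed, word)
        return word


lang = Language()
lang.lookup(12345, lambda: "lexeme-A")  # first sighting: stored in happax
lang.lookup(12345, lambda: "lexeme-A")  # second sighting: promoted
assert 12345 in lang.vocab
```

The eviction path is the notable design choice: on a slot collision the resident hapax graduates to `vocab` as if it had been seen twice. That occasionally promotes a true one-off early, but it guarantees the fixed-size table never silently loses a lexeme.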