* Use a sparse_hash_map to store hapax (seen exactly once) vocab items, with a max size.

This commit is contained in:
Matthew Honnibal 2014-07-31 17:40:43 +01:00
parent a235804730
commit 5b81ee716f
2 changed files with 24 additions and 3 deletions

View File

@ -2,12 +2,14 @@ from libcpp.vector cimport vector
from libc.stdint cimport uint64_t from libc.stdint cimport uint64_t
from sparsehash.dense_hash_map cimport dense_hash_map from sparsehash.dense_hash_map cimport dense_hash_map
from sparsehash.sparse_hash_map cimport sparse_hash_map
# Circular import problems here # Circular import problems here
ctypedef size_t Lexeme_addr ctypedef size_t Lexeme_addr
ctypedef uint64_t StringHash ctypedef uint64_t StringHash
ctypedef dense_hash_map[StringHash, size_t] Vocab ctypedef dense_hash_map[StringHash, size_t] Vocab
ctypedef sparse_hash_map[StringHash, size_t] SparseVocab
from spacy.lexeme cimport Lexeme from spacy.lexeme cimport Lexeme
from spacy.tokens cimport Tokens from spacy.tokens cimport Tokens
@ -25,6 +27,7 @@ from spacy.lexeme cimport Orthography
cdef class Language: cdef class Language:
cdef object name cdef object name
cdef SparseVocab* happax
cdef Vocab* vocab cdef Vocab* vocab
cdef Vocab* distri cdef Vocab* distri
cdef Vocab* ortho cdef Vocab* ortho

View File

@ -53,13 +53,18 @@ def set_orth_flags(lex, length):
return 0 return 0
DEF MAX_HAPPAX = 1000000
cdef class Language: cdef class Language:
def __cinit__(self, name): def __cinit__(self, name):
self.name = name self.name = name
self.bacov = {} self.bacov = {}
self.happax = new SparseVocab()
self.vocab = new Vocab() self.vocab = new Vocab()
self.ortho = new Vocab() self.ortho = new Vocab()
self.distri = new Vocab() self.distri = new Vocab()
self.happax[0].set_deleted_key(0)
self.vocab[0].set_empty_key(0) self.vocab[0].set_empty_key(0)
self.distri[0].set_empty_key(0) self.distri[0].set_empty_key(0)
self.ortho[0].set_empty_key(0) self.ortho[0].set_empty_key(0)
@ -114,15 +119,28 @@ cdef class Language:
if length == 0: if length == 0:
return <Lexeme_addr>&BLANK_WORD return <Lexeme_addr>&BLANK_WORD
cdef StringHash hashed = self.hash_string(string, length) cdef StringHash hashed = self.hash_string(string, length)
# First, check words seen 2+ times
cdef Lexeme* word_ptr = <Lexeme*>self.vocab[0][hashed] cdef Lexeme* word_ptr = <Lexeme*>self.vocab[0][hashed]
if word_ptr == NULL: if word_ptr == NULL:
start = self.find_split(string, length) if start == -1 else start # Now check words seen exactly once
word_ptr = self._add(hashed, string, start, length) word_ptr = <Lexeme*>self.happax[0][hashed]
if word_ptr == NULL:
start = self.find_split(string, length) if start == -1 else start
word_ptr = self._add(hashed, string, start, length)
else:
# Second time word seen, move to vocab
self.vocab[0][hashed] = <Lexeme_addr>word_ptr
self.happax[0].erase(hashed)
return <Lexeme_addr>word_ptr return <Lexeme_addr>word_ptr
cdef Lexeme* _add(self, StringHash hashed, unicode string, int split, size_t length):
    # Create the Lexeme for a string that is in neither `vocab` nor `happax`
    # and record it in `happax`, the table of words seen exactly once.
    #
    # hashed: hash of `string`, used as the key in `happax` and `bacov`.
    # string: the unicode form, stored in `bacov` so the hash can be mapped
    #         back to text.
    # split, length: forwarded unchanged to `init_lexeme`.
    #
    # Returns the pointer produced by `init_lexeme`.
    if self.happax[0].size() >= MAX_HAPPAX:
        # Table is full: evict one entry to make room.  A hash map has no
        # meaningful "last" element, and end() is the past-the-end iterator,
        # so erase(end()) removes nothing and the size cap was never
        # enforced.  begin() refers to a real (arbitrary) entry.
        # NOTE(review): the evicted Lexeme is not freed here because its
        # address may already have been handed out to callers; its `bacov`
        # entry is also left in place -- confirm this is acceptable.
        self.happax[0].erase(self.happax[0].begin())
    word = self.init_lexeme(string, hashed, split, length)
    self.happax[0][hashed] = <Lexeme_addr>word
    self.bacov[hashed] = string
    return word