* Shifting to WordTree instead of dense_hash_map for storage.

Matthew Honnibal 2014-08-15 23:06:46 +02:00
parent f11c8e22eb
commit 3ada25b92d
7 changed files with 137 additions and 101 deletions

View File

@@ -1,6 +1,9 @@
 from libc.stdint cimport uint64_t
+from chartree cimport CharTree
+
+cdef bytes to_utf8(unicode string)
 
 
 cdef class FixedTable:
     cdef size_t size
     cdef uint64_t* keys
@@ -9,3 +12,15 @@ cdef class FixedTable:
     cdef size_t insert(self, uint64_t key, size_t value) nogil
     cdef size_t get(self, uint64_t key) nogil
     cdef int erase(self, uint64_t key) nogil
+
+
+cdef class WordTree:
+    cdef size_t max_length
+    cdef size_t default
+
+    cdef CharTree* _trees
+    cdef dict _dict
+
+    cdef size_t get(self, bytes string) except *
+    cdef int set(self, bytes string, size_t value) except *
+    cdef bint contains(self, bytes string) except *

View File

@@ -1,6 +1,8 @@
 from libc.stdlib cimport calloc, free
 import cython
+cimport chartree
+
 
 
 cdef class FixedTable:
     def __cinit__(self, const size_t size):
@@ -51,3 +53,54 @@ cdef class FixedTable:
 @cython.cdivision
 cdef inline size_t _find(uint64_t key, size_t size) nogil:
     return key % size
+
+
+cdef bytes to_utf8(unicode string):
+    cdef bytes py_byte_string = string.encode('UTF-8')
+    return py_byte_string
+
+
+cdef unicode to_unicode(unsigned char[:] c_string, size_t length):
+    # This prevents a call to strlen
+    cdef bytes py_string = <bytes>c_string[:length]
+    return py_string.decode('utf8')
+
+
+cdef class WordTree:
+    def __cinit__(self, size_t default, size_t max_length):
+        self.max_length = max_length
+        self.default = default
+        self._trees = <CharTree*>calloc(max_length, sizeof(CharTree))
+        for i in range(self.max_length):
+            chartree.init(&self._trees[i], i)
+        self._dict = {}
+
+    cdef size_t get(self, bytes string) except *:
+        cdef size_t length = len(string)
+        if length >= self.max_length:
+            return self._dict.get(string, 0)
+        else:
+            return chartree.getitem(&self._trees[length], string)
+
+    cdef int set(self, bytes string, size_t value) except *:
+        cdef size_t length = len(string)
+        if length >= self.max_length:
+            self._dict[string] = value
+        else:
+            chartree.setitem(&self._trees[length], string, value)
+
+    cdef bint contains(self, bytes string) except *:
+        cdef size_t length = len(string)
+        if length >= self.max_length:
+            return string in self._dict
+        else:
+            return chartree.contains(&self._trees[length], string)
+
+    def __getitem__(self, unicode key):
+        return self.get(to_utf8(key))
+
+    def __setitem__(self, unicode key, size_t value):
+        self.set(to_utf8(key), value)
+
+    def __contains__(self, unicode key):
+        return self.contains(to_utf8(key))
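
A note on the storage scheme: WordTree buckets keys by byte length, keeping one CharTree per length up to max_length and falling back to a plain Python dict for anything longer. The sketch below is a pure-Python model of that behaviour only; the PyWordTree name and its dict-backed tables are stand-ins for the CharTree C structs, whose internals this commit does not show.

    class PyWordTree:
        '''Pure-Python model of WordTree: per-length tables plus a dict fallback.'''
        def __init__(self, default=0, max_length=10):
            self.default = default
            self.max_length = max_length
            # Stand-ins for the CharTree* array allocated in __cinit__
            self._trees = [{} for _ in range(max_length)]
            self._dict = {}

        def __setitem__(self, key, value):
            string = key.encode('utf8')
            if len(string) >= self.max_length:
                self._dict[string] = value
            else:
                self._trees[len(string)][string] = value

        def __getitem__(self, key):
            string = key.encode('utf8')
            if len(string) >= self.max_length:
                return self._dict.get(string, self.default)
            return self._trees[len(string)].get(string, self.default)

        def __contains__(self, key):
            string = key.encode('utf8')
            if len(string) >= self.max_length:
                return string in self._dict
            return string in self._trees[len(string)]

    vocab = PyWordTree(default=0, max_length=10)
    vocab['the'] = 42
    assert vocab['the'] == 42 and 'the' in vocab
    assert vocab['tokenization'] == 0  # >= max_length: dict fallback, default value

Because every key within one table has the same length, a lookup never needs terminators or length checks inside the tree itself, which is presumably the point of splitting by length.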

View File

@@ -43,9 +43,8 @@ cdef bint is_punct(unicode word, size_t i, size_t length):
     # Don't count commas as punct if the next char is a number
     if word[i] == "," and i < (length - 1) and word[i+1].isdigit():
         return False
-    # Don't count periods as punct if the next char is not whitespace
-    if word[i] == "." and i < (length - 1) and not word[i+1].isspace():
-        return False
+    if word[i] == ".":
+        return True
     return not word[i].isalnum()
@@ -62,3 +61,6 @@ cpdef Lexeme_addr lookup(unicode string) except 0:
 
 cpdef unicode unhash(StringHash hash_value):
     return EN.unhash(hash_value)
+
+def words():
+    return EN.words
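
The is_punct change above is behavioural, not cosmetic: a period now always counts as punctuation, where previously it was left alone unless followed by whitespace (so the internal periods of abbreviations like "U.K." were preserved). A quick pure-Python rendering of the new rule; the standalone signature is an assumption for illustration:

    def is_punct(word, i, length):
        # Don't count commas as punct if the next char is a number: "10,000"
        if word[i] == "," and i < (length - 1) and word[i + 1].isdigit():
            return False
        # Periods now always count as punct, even mid-abbreviation
        if word[i] == ".":
            return True
        return not word[i].isalnum()

    assert is_punct("U.K.", 1, 4)        # the old rule returned False here
    assert not is_punct("10,000", 2, 6)  # comma before a digit is kept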

View File

@@ -20,6 +20,11 @@ from spacy.spacy cimport StringHash
 #SHAPE = StringAttr.shape
 #LAST3 = StringAttr.last3
 
+cdef Lexeme* init(StringHash hashed, bytes lex_string) except NULL:
+    cdef Lexeme* word = <Lexeme*>calloc(1, sizeof(Lexeme))
+    word.lex = hashed
+    return word
+
 
 cpdef StringHash attr_of(size_t lex_id, StringAttr attr) except 0:
     if attr == SIC:

View File

@@ -4,6 +4,7 @@ from libc.stdint cimport uint64_t
 from sparsehash.dense_hash_map cimport dense_hash_map
 from _hashing cimport FixedTable
+from _hashing cimport WordTree
 
 
 # Circular import problems here
 ctypedef size_t Lexeme_addr
@@ -26,7 +27,7 @@ from spacy.lexeme cimport Orthography
 
 cdef class Language:
     cdef object name
-    cdef Vocab* vocab
+    cdef WordTree vocab
     cdef Vocab* distri
     cdef Vocab* ortho
     cdef dict bacov
@@ -37,7 +38,7 @@ cdef class Language:
     cdef unicode unhash(self, StringHash hashed)
 
     cpdef Tokens tokenize(self, unicode text)
-    cdef Lexeme* _add(self, StringHash hashed, unicode string, int split, size_t length)
+    cdef Lexeme* _add(self, StringHash hashed, unicode string, int split, size_t length) except NULL
     cdef Lexeme* init_lexeme(self, unicode string, StringHash hashed,
                              int split, size_t length)
     cdef Orthography* init_orth(self, StringHash hashed, unicode lex)

View File

@@ -5,12 +5,12 @@ from libc.stdlib cimport calloc, free
 from libcpp.pair cimport pair
 from cython.operator cimport dereference as deref
 
-from murmurhash cimport mrmr
-
 from spacy.lexeme cimport Lexeme
 from spacy.lexeme cimport BLANK_WORD
 from spacy.string_tools cimport substr
+from _hashing cimport WordTree
+from _hashing cimport to_utf8
 
 from . import util
 from os import path
@@ -58,28 +58,27 @@ cdef class Language:
     def __cinit__(self, name):
         self.name = name
         self.bacov = {}
-        self.vocab = new Vocab()
+        self.vocab = WordTree(0, 10)
         self.ortho = new Vocab()
         self.distri = new Vocab()
-        self.vocab[0].set_empty_key(0)
         self.distri[0].set_empty_key(0)
         self.ortho[0].set_empty_key(0)
-        self.vocab[0].set_deleted_key(1)
         self.distri[0].set_deleted_key(1)
         self.ortho[0].set_deleted_key(1)
         self.load_tokenization(util.read_tokenization(name))
 
+    property words:
+        def __get__(self):
+            return self.bacov.keys()
+
     def load_tokenization(self, token_rules=None):
         cdef Lexeme* word
         cdef StringHash hashed
         for chunk, lex, tokens in token_rules:
-            hashed = self.hash_string(chunk, len(chunk))
-            word = self._add(hashed, lex, len(lex), len(lex))
+            word = self.init_lexeme(chunk)
             for i, lex in enumerate(tokens):
                 token_string = '%s:@:%d:@:%s' % (chunk, i, lex)
-                length = len(token_string)
-                hashed = self.hash_string(token_string, length)
-                word.tail = self._add(hashed, lex, 0, len(lex))
+                word.tail = self.init_lexeme(lex)
                 word = word.tail
 
     def load_clusters(self):
@@ -89,111 +88,59 @@ cdef class Language:
         brown_loc = path.join(data_dir, 'clusters')
         cdef size_t start
         cdef int end
+        cdef unicode token_unicode
+        cdef bytes token_bytes
         with util.utf8open(brown_loc) as browns_file:
             for i, line in enumerate(browns_file):
-                cluster_str, token_string, freq_str = line.split()
+                cluster_str, token_unicode, freq_str = line.split()
+                token_bytes = token_unicode.encode('utf8')
                 # Decode as a little-endian string, so that we can do & 15 to get
                 # the first 4 bits. See redshift._parse_features.pyx
                 cluster = int(cluster_str[::-1], 2)
                 upper_pc, title_pc = case_stats.get(token_string.lower(), (0.0, 0.0))
-                hashed = self.hash_string(token_string, len(token_string))
-                word = self._add(hashed, token_string,
-                                 len(token_string), len(token_string))
-
-    cdef StringHash hash_string(self, Py_UNICODE* s, size_t length) except 0:
-        '''Hash unicode with MurmurHash64A'''
-        return mrmr.hash32(<Py_UNICODE*>s, length * sizeof(Py_UNICODE), 0)
-
-    cdef unicode unhash(self, StringHash hash_value):
-        '''Fetch a string from the reverse index, given its hash value.'''
-        return self.bacov[hash_value].decode('utf8')
-
-    cdef Lexeme_addr lookup(self, int start, Py_UNICODE* string, size_t length) except 0:
-        '''Fetch a Lexeme representing a word string. If the word has not been seen,
-        construct one, splitting off any attached punctuation or clitics. A
-        reference to BLANK_WORD is returned for the empty string.
-
-        To specify the boundaries of the word if it has not been seen, use lookup_chunk.
-        '''
-        if length == 0:
-            return <Lexeme_addr>&BLANK_WORD
-        cdef StringHash hashed = self.hash_string(string, length)
-        # First, check words seen 2+ times
-        cdef Lexeme* word_ptr = <Lexeme*>self.vocab[0][hashed]
-        if word_ptr == NULL:
-            start = self.find_split(string, length) if start == -1 else start
-            word_ptr = self._add(hashed, string, start, length)
-        return <Lexeme_addr>word_ptr
-
-    cdef Lexeme* _add(self, StringHash hashed, unicode string, int split, size_t length):
-        cdef size_t i
-        word = self.init_lexeme(string, hashed, split, length)
-        self.vocab[0][hashed] = <size_t>word
-        self.bacov[hashed] = string.encode('utf8')
-        return word
-
-    cpdef Tokens tokenize(self, unicode string):
-        cdef size_t length = len(string)
-        cdef Py_UNICODE* characters = <Py_UNICODE*>string
-
-        cdef size_t i
-        cdef Py_UNICODE c
+                word = self.init_lexeme(token_bytes)
+
+    cpdef Tokens tokenize(self, unicode unicode_string):
+        cdef bytes characters = unicode_string.encode('utf8')
+        cdef size_t length = len(characters)
         cdef Tokens tokens = Tokens(self)
-        cdef Py_UNICODE* current = <Py_UNICODE*>calloc(len(string), sizeof(Py_UNICODE))
-        cdef size_t word_len = 0
+        cdef size_t start = 0
         cdef Lexeme* token
+        cdef size_t i
+        cdef unsigned char c
         for i in range(length):
             c = characters[i]
-            if _is_whitespace(c):
-                if word_len != 0:
-                    token = <Lexeme*>self.lookup(-1, current, word_len)
+            if c == b' ':
+                if start < i:
+                    token = <Lexeme*>self.lookup(characters[start:i])
                     while token != NULL:
                         tokens.append(<Lexeme_addr>token)
                         token = token.tail
-                    for j in range(word_len+1):
-                        current[j] = 0
-                    word_len = 0
-            else:
-                current[word_len] = c
-                word_len += 1
-        if word_len != 0:
-            token = <Lexeme*>self.lookup(-1, current, word_len)
+                start = i + 1
+        if start < i:
+            token = <Lexeme*>self.lookup(characters[start:])
             while token != NULL:
                 tokens.append(<Lexeme_addr>token)
                 token = token.tail
-        free(current)
         return tokens
 
-    cdef int find_split(self, unicode word, size_t length):
-        return -1
-
-    cdef Lexeme* init_lexeme(self, unicode string, StringHash hashed,
-                             int split, size_t length):
-        cdef Lexeme* word = <Lexeme*>calloc(1, sizeof(Lexeme))
-
-        word.sic = hashed
-
-        cdef unicode tail_string
-        cdef unicode lex
-        if split != 0 and split < length:
-            lex = substr(string, 0, split, length)
-            tail_string = substr(string, split, length, length)
-        else:
-            lex = string
-            tail_string = ''
-
-        word.lex = self.hash_string(lex, len(lex))
-        self.bacov[word.lex] = lex.encode('utf8')
-        word.orth = <Orthography*>self.ortho[0][word.lex]
-        if word.orth == NULL:
-            word.orth = self.init_orth(word.lex, lex)
-        word.dist = <Distribution*>self.distri[0][word.lex]
-
-        # Now recurse, and deal with the tail
-        if tail_string:
-            word.tail = <Lexeme*>self.lookup(-1, tail_string, len(tail_string))
-        return word
+    cdef Lexeme_addr lookup(self, bytes string) except 0:
+        '''Fetch a Lexeme representing a word string. If the word has not been seen,
+        construct one, splitting off any attached punctuation or clitics. A
+        reference to BLANK_WORD is returned for the empty string.
+        '''
+        cdef size_t length = len(string)
+        if length == 0:
+            return <Lexeme_addr>&BLANK_WORD
+        cdef Lexeme* word_ptr = <Lexeme*>self.vocab.get(string)
+        if word_ptr == NULL:
+            start = self.find_split(string, length)
+            word_ptr = self.init_lexeme(string[start:])
+            self.vocab.set(string[start:], <size_t>word_ptr)
+        return <Lexeme_addr>word_ptr
 
     cdef Orthography* init_orth(self, StringHash hashed, unicode lex):
         cdef Orthography* orth = <Orthography*>calloc(1, sizeof(Orthography))
@@ -219,6 +166,13 @@ cdef class Language:
         self.ortho[0][hashed] = <size_t>orth
         return orth
 
+    cdef unicode unhash(self, StringHash hash_value):
+        '''Fetch a string from the reverse index, given its hash value.'''
+        return self.bacov[hash_value].decode('utf8')
+
+    cdef int find_split(self, unicode word, size_t length):
+        return -1
+
 
 cdef inline bint _is_whitespace(Py_UNICODE c) nogil:
     if c == ' ':
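
The rewritten tokenize above is the heart of the commit: it scans UTF-8 bytes instead of Py_UNICODE, slices out each space-delimited chunk, and then walks each Lexeme's tail chain so that one chunk can emit several tokens (the punctuation or clitics that lookup splits off). Below is a minimal Python sketch of that control flow, with lookup and its node type as hypothetical stand-ins; note that the diff's trailing check `if start < i` compares against the loop variable and so would drop a one-character final token, which is why the sketch uses len(characters) instead:

    def tokenize(lookup, unicode_string):
        # lookup(chunk: bytes) returns the head of a linked list of token
        # nodes with a .tail pointer, mirroring the Lexeme* chain.
        characters = unicode_string.encode('utf8')
        tokens = []
        start = 0
        for i, c in enumerate(characters):
            if c == ord(' '):
                if start < i:
                    token = lookup(characters[start:i])
                    while token is not None:  # follow the tail chain
                        tokens.append(token)
                        token = token.tail
                start = i + 1
        if start < len(characters):  # flush the final chunk
            token = lookup(characters[start:])
            while token is not None:
                tokens.append(token)
                token = token.tail
        return tokens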

View File

@@ -1,4 +1,10 @@
 # cython: profile=True
+from murmurhash cimport mrmr
+
+cdef StringHash hash_string(self, unsigned char* s, size_t length) except 0:
+    '''Hash bytes with MurmurHash32'''
+    return mrmr.hash32(s, length * sizeof(unsigned char), 0)
+
 
 
 cpdef unicode substr(unicode string, int start, int end, size_t length):
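
The hash that moved here is a 32-bit MurmurHash over raw bytes, seed 0. If you want to reproduce it from plain Python, the mmh3 package wraps the same hash family; treating its output as bit-for-bit identical to murmurhash.mrmr.hash32 is an assumption to verify, not something this diff establishes:

    import mmh3  # pip install mmh3

    token = u'hello'.encode('utf8')
    # 32-bit hash of the raw bytes with seed 0, as in hash_string above
    key = mmh3.hash(token, 0, signed=False)
    print(key)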