* Restore unicode, work on improving string storage.

Matthew Honnibal 2014-08-16 14:35:34 +02:00
parent a225ca5b0d
commit 36073b89fe


@@ -10,9 +10,6 @@ from spacy.lexeme cimport Lexeme
 from spacy.lexeme cimport BLANK_WORD
 from spacy.string_tools cimport substr
-from spacy.string_tools cimport to_bytes
-from spacy.string_tools cimport from_bytes
 from . import util
 from os import path
@@ -26,7 +23,7 @@ def get_normalized(unicode lex, size_t length):
         return get_word_shape(lex, length)
 
-def get_word_shape(lex, length):
+def get_word_shape(unicode lex, length):
     shape = ""
     last = ""
     shape_char = ""
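This hunk only tightens get_word_shape's signature, but the shape/last/shape_char locals hint at the usual run-collapsing word-shape transform. A minimal Python sketch of that idea, assuming the conventional rules (uppercase to X, lowercase to x, digit to d, runs capped), not necessarily the exact logic in this file:

    def word_shape(word):
        # Map each character to a class and cap runs of the same class
        # at three, so "Google" -> "Xxxx" and "1999" -> "ddd".
        shape = []
        for c in word:
            if c.isalpha():
                cls = "X" if c.isupper() else "x"
            elif c.isdigit():
                cls = "d"
            else:
                cls = c
            if shape[-3:] != [cls, cls, cls]:
                shape.append(cls)
        return "".join(shape)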
@@ -71,51 +68,38 @@ cdef class Language:
         self.ortho[0].set_deleted_key(1)
         self.load_tokenization(util.read_tokenization(name))
 
-    cpdef Tokens tokenize(self, unicode string):
-        cdef size_t length = len(string)
-        cdef Py_UNICODE* characters = <Py_UNICODE*>string
-
-        cdef size_t i
-        cdef Py_UNICODE c
+    cpdef Tokens tokenize(self, unicode characters):
+        cdef size_t i = 0
+        cdef size_t start = 0
         cdef Tokens tokens = Tokens(self)
-        cdef Py_UNICODE* current = <Py_UNICODE*>calloc(len(string), sizeof(Py_UNICODE))
-        cdef size_t word_len = 0
         cdef Lexeme* token
-        for i in range(length):
-            c = characters[i]
+        for c in characters:
             if _is_whitespace(c):
-                if word_len != 0:
-                    token = <Lexeme*>self.lookup_chunk(current)
+                if start < i:
+                    token = <Lexeme*>self.lookup_chunk(characters[start:i])
                     while token != NULL:
                         tokens.append(<Lexeme_addr>token)
                         token = token.tail
-                    for j in range(word_len+1):
-                        current[j] = 0
-                    word_len = 0
-            else:
-                current[word_len] = c
-                word_len += 1
-        if word_len != 0:
-            token = <Lexeme*>self.lookup_chunk(current)
+                start = i + 1
+            i += 1
+        if start < i:
+            token = <Lexeme*>self.lookup_chunk(characters[start:])
             while token != NULL:
                 tokens.append(<Lexeme_addr>token)
                 token = token.tail
-        free(current)
         return tokens
 
     cdef Lexeme_addr lookup(self, unicode string) except 0:
         cdef size_t length = len(string)
         if length == 0:
             return <Lexeme_addr>&BLANK_WORD
-        cdef bytes b = to_bytes(string)
-        cdef StringHash hashed = mrmr.hash32(<char*>b, len(b) * sizeof(char), 0)
+        cdef StringHash hashed = mrmr.hash32(<Py_UNICODE*>string, length * sizeof(Py_UNICODE), 0)
         # First, check words seen 2+ times
         cdef Lexeme* word_ptr = <Lexeme*>self.vocab[0][hashed]
         if word_ptr == NULL:
             word_ptr = self.new_lexeme(hashed, string)
-            self.bacov[hashed] = b
         return <Lexeme_addr>word_ptr
 
     cdef Lexeme_addr lookup_chunk(self, unicode string) except 0:
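The rewritten tokenize drops the manually managed Py_UNICODE scratch buffer (calloc/free) in favour of two indices into the unicode string, slicing out each span between whitespace characters. A pure-Python sketch of the new control flow, with a hypothetical lookup_chunk argument standing in for the real chunk lookup (and the token.tail chain-walking elided):

    def tokenize(characters, lookup_chunk=str):
        tokens = []
        start = 0
        i = 0
        for c in characters:
            if c in " \n\t":
                if start < i:        # flush the span pending since `start`
                    tokens.append(lookup_chunk(characters[start:i]))
                start = i + 1        # resume after the whitespace character
            i += 1
        if start < i:                # flush a trailing token
            tokens.append(lookup_chunk(characters[start:]))
        return tokens

    # tokenize("Hello  world\n") -> ["Hello", "world"]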
@@ -126,7 +110,7 @@ cdef class Language:
         cdef size_t length = len(string)
         if length == 0:
             return <Lexeme_addr>&BLANK_WORD
-        cdef StringHash hashed = self.hash_string(string)
+        cdef StringHash hashed = mrmr.hash32(<Py_UNICODE*>string, length * sizeof(Py_UNICODE), 0)
         # First, check words seen 2+ times
         cdef Lexeme* word_ptr = <Lexeme*>self.vocab[0][hashed]
         cdef int split
@@ -168,11 +152,11 @@ cdef class Language:
         length = len(lex)
         orth = <Orthography*>calloc(1, sizeof(Orthography))
-        orth.first = <Py_UNICODE>lex[0]
+        orth.first = lex[0]
         orth.length = length
         orth.flags = set_orth_flags(lex, orth.length)
-        orth.norm = hashed
         orth.last3 = self.hash_string(substr(lex, length - 3, length, length))
         orth.norm = self.hash_string(get_normalized(lex, length))
         orth.shape = self.hash_string(get_word_shape(lex, length))
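This hunk appears to drop the early orth.norm = hashed assignment, which was overwritten by the get_normalized result two lines later. The Orthography struct caches hashed derived views of the word; roughly, in plain Python (field names taken from the diff, with plain strings in place of the hashes for readability):

    def orthography_views(lex):
        # each view is hashed with hash_string in the real code
        return {
            "first": lex[0],
            "length": len(lex),
            "last3": lex[-3:],          # final trigram
            "norm": lex.lower(),        # stand-in for get_normalized
            "shape": word_shape(lex),   # as sketched above
        }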
@@ -185,17 +169,14 @@ cdef class Language:
         self.distri[0][hashed] = <size_t>dist
         return dist
 
-    cdef StringHash hash_string(self, unicode s) except 0:
+    cdef StringHash hash_string(self, unicode string) except 0:
         '''Hash unicode with MurmurHash64A'''
-        cdef bytes byte_string = to_bytes(s)
-        cdef StringHash hashed = mrmr.hash32(<char*>byte_string, len(byte_string) * sizeof(char), 0)
-        self.bacov[hashed] = byte_string
+        cdef StringHash hashed = mrmr.hash32(<Py_UNICODE*>string, len(string) * sizeof(Py_UNICODE), 0)
         return hashed
 
     cdef unicode unhash(self, StringHash hash_value):
         '''Fetch a string from the reverse index, given its hash value.'''
-        return from_bytes(self.bacov[hash_value])
+        return self.bacov[hash_value]
 
     cdef int find_split(self, unicode word, size_t length):
         return -1
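With to_bytes/from_bytes gone, hash_string feeds MurmurHash the raw Py_UNICODE buffer, and bacov maps the 32-bit hash straight back to the unicode string, so unhash becomes a plain dict lookup. A Python sketch of that hash-addressed string store, using mmh3 as a stand-in for the mrmr MurmurHash wrapper (note the bacov insert no longer happens inside hash_string itself; this sketch keeps it in a separate intern step):

    import mmh3

    class StringStore:
        def __init__(self):
            self.bacov = {}          # hash -> original unicode string

        def hash_string(self, string):
            # unsigned 32-bit MurmurHash, as mrmr.hash32 produces
            return mmh3.hash(string) & 0xFFFFFFFF

        def intern(self, string):
            hashed = self.hash_string(string)
            self.bacov[hashed] = string
            return hashed

        def unhash(self, hash_value):
            return self.bacov[hash_value]

    # store = StringStore()
    # h = store.intern(u"potato")
    # store.unhash(h)  -> u"potato"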
@@ -231,12 +212,12 @@ cdef class Language:
             word = self.init_lexeme(hashed, token_string)
 
-cdef inline bint _is_whitespace(Py_UNICODE c) nogil:
-    if c == ' ':
+cdef inline bint _is_whitespace(unsigned char c) nogil:
+    if c == b' ':
         return True
-    elif c == '\n':
+    elif c == b'\n':
         return True
-    elif c == '\t':
+    elif c == b'\t':
         return True
     else:
         return False
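The helper now compares C-level unsigned char values against byte literals; keeping the body free of Python objects is what makes the inline ... nogil declaration legal. A hypothetical one-line Cython formulation of the same predicate:

    cdef inline bint _is_whitespace(unsigned char c) nogil:
        # pure C integer comparisons, so no GIL is needed
        return c == b' ' or c == b'\n' or c == b'\t'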