mirror of https://github.com/explosion/spaCy.git
synced 2024-12-26 18:06:29 +03:00

Restore unicode, work on improving string storage.

This commit is contained in:
parent a225ca5b0d
commit 36073b89fe

@@ -10,9 +10,6 @@ from spacy.lexeme cimport Lexeme
 from spacy.lexeme cimport BLANK_WORD
 
 from spacy.string_tools cimport substr
-from spacy.string_tools cimport to_bytes
-from spacy.string_tools cimport from_bytes
-
 
 from . import util
 from os import path
|
@@ -26,7 +23,7 @@ def get_normalized(unicode lex, size_t length):
         return get_word_shape(lex, length)
 
 
-def get_word_shape(lex, length):
+def get_word_shape(unicode lex, length):
     shape = ""
     last = ""
     shape_char = ""
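The hunk above only changes the signature of get_word_shape; the body, which uses the shape, last, and shape_char locals visible in the context lines, is not part of the diff. For orientation, a typical word-shape transform maps each character to a class and collapses long runs. The sketch below is an assumed illustration consistent with those locals, not the function's actual body:

    def word_shape(word):
        # Map each character to a class: X (upper), x (lower), d (digit),
        # anything else passes through. Collapsing runs longer than four is
        # an assumption, common in shape features of this kind.
        shape = ""
        last = ""
        length = 0
        for c in word:
            if c.isalpha():
                shape_char = "X" if c.isupper() else "x"
            elif c.isdigit():
                shape_char = "d"
            else:
                shape_char = c
            if shape_char == last:
                length += 1
            else:
                length = 0
            last = shape_char
            if length < 5:
                shape += shape_char
        return shape

For example, word_shape("spaCy2.1") gives "xxxXxd.d".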
|
@@ -71,51 +68,38 @@ cdef class Language:
         self.ortho[0].set_deleted_key(1)
         self.load_tokenization(util.read_tokenization(name))
 
-    cpdef Tokens tokenize(self, unicode string):
-        cdef size_t length = len(string)
-        cdef Py_UNICODE* characters = <Py_UNICODE*>string
+    cpdef Tokens tokenize(self, unicode characters):
+        cdef size_t i = 0
+        cdef size_t start = 0
 
-        cdef size_t i
-        cdef Py_UNICODE c
-
         cdef Tokens tokens = Tokens(self)
-        cdef Py_UNICODE* current = <Py_UNICODE*>calloc(len(string), sizeof(Py_UNICODE))
-        cdef size_t word_len = 0
         cdef Lexeme* token
-        for i in range(length):
-            c = characters[i]
+        for c in characters:
             if _is_whitespace(c):
-                if word_len != 0:
-                    token = <Lexeme*>self.lookup_chunk(current)
+                if start < i:
+                    token = <Lexeme*>self.lookup_chunk(characters[start:i])
                     while token != NULL:
                         tokens.append(<Lexeme_addr>token)
                         token = token.tail
-                for j in range(word_len+1):
-                    current[j] = 0
-                word_len = 0
-            else:
-                current[word_len] = c
-                word_len += 1
-        if word_len != 0:
-            token = <Lexeme*>self.lookup_chunk(current)
+                start = i + 1
+            i += 1
+        if start < i:
+            token = <Lexeme*>self.lookup_chunk(characters[start:])
             while token != NULL:
                 tokens.append(<Lexeme_addr>token)
                 token = token.tail
-        free(current)
         return tokens
 
     cdef Lexeme_addr lookup(self, unicode string) except 0:
         cdef size_t length = len(string)
         if length == 0:
             return <Lexeme_addr>&BLANK_WORD
-        cdef bytes b = to_bytes(string)
 
-        cdef StringHash hashed = mrmr.hash32(<char*>b, len(b) * sizeof(char), 0)
+        cdef StringHash hashed = mrmr.hash32(<Py_UNICODE*>string, length * sizeof(Py_UNICODE), 0)
         # First, check words seen 2+ times
         cdef Lexeme* word_ptr = <Lexeme*>self.vocab[0][hashed]
         if word_ptr == NULL:
             word_ptr = self.new_lexeme(hashed, string)
-            self.bacov[hashed] = b
         return <Lexeme_addr>word_ptr
 
     cdef Lexeme_addr lookup_chunk(self, unicode string) except 0:
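The tokenize rewrite above drops the manually managed Py_UNICODE scratch buffer (current, word_len, and the calloc/free pair) in favour of index arithmetic over the unicode input itself: a start index marks the current word, and a slice is taken only when whitespace closes it. A rough pure-Python sketch of the new control flow, with a plain list standing in for Tokens and a stub in place of lookup_chunk (both stand-ins are illustrative, not from the diff):

    def tokenize(characters, lookup_chunk=lambda span: span):
        # start marks the first character of the current word; slices are
        # taken only at whitespace boundaries, so no scratch buffer is needed.
        tokens = []
        start = 0
        i = 0
        for c in characters:
            if c in " \n\t":          # mirrors _is_whitespace
                if start < i:         # skip empty spans in whitespace runs
                    tokens.append(lookup_chunk(characters[start:i]))
                start = i + 1
            i += 1
        if start < i:                 # flush the trailing word, if any
            tokens.append(lookup_chunk(characters[start:]))
        return tokens

Here tokenize("Hello  world") returns ["Hello", "world"]; the start < i guard is what suppresses empty tokens inside runs of whitespace.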
|
@@ -126,7 +110,7 @@ cdef class Language:
         cdef size_t length = len(string)
         if length == 0:
             return <Lexeme_addr>&BLANK_WORD
-        cdef StringHash hashed = self.hash_string(string)
+        cdef StringHash hashed = mrmr.hash32(<Py_UNICODE*>string, length * sizeof(Py_UNICODE), 0)
         # First, check words seen 2+ times
         cdef Lexeme* word_ptr = <Lexeme*>self.vocab[0][hashed]
         cdef int split
|
@@ -168,11 +152,11 @@ cdef class Language:
 
         length = len(lex)
         orth = <Orthography*>calloc(1, sizeof(Orthography))
-        orth.first = <Py_UNICODE>lex[0]
-
+        orth.first = lex[0]
 
         orth.length = length
         orth.flags = set_orth_flags(lex, orth.length)
+        orth.norm = hashed
         orth.last3 = self.hash_string(substr(lex, length - 3, length, length))
         orth.norm = self.hash_string(get_normalized(lex, length))
         orth.shape = self.hash_string(get_word_shape(lex, length))
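In the hunk above, the freshly allocated Orthography record now takes the raw string hash as its norm up front, and the later context line overwrites norm with the hash of the normalized form, so the raw hash acts as a default. A self-contained Python model of the same field population, with a dict standing in for the calloc'd struct and the helpers passed in as parameters (all names here are illustrative):

    def init_orthography(lex, hashed, hash_string, normalize, shape):
        # The dict stands in for the calloc'd Orthography struct.
        orth = {
            "first": lex[0],
            "norm": hashed,                  # default norm: the raw hash
            "length": len(lex),
            "last3": hash_string(lex[-3:]),  # hashed 3-character suffix
        }
        orth["norm"] = hash_string(normalize(lex))   # overwrites the default
        orth["shape"] = hash_string(shape(lex))
        return orth

    orth = init_orthography(
        "Spacy", 0xBEEF, hash, str.lower,
        lambda s: "".join("X" if c.isupper() else "x" for c in s))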
|
@@ -185,17 +169,14 @@ cdef class Language:
         self.distri[0][hashed] = <size_t>dist
         return dist
 
-    cdef StringHash hash_string(self, unicode s) except 0:
+    cdef StringHash hash_string(self, unicode string) except 0:
         '''Hash unicode with MurmurHash64A'''
-        cdef bytes byte_string = to_bytes(s)
-
-        cdef StringHash hashed = mrmr.hash32(<char*>byte_string, len(byte_string) * sizeof(char), 0)
-        self.bacov[hashed] = byte_string
+        cdef StringHash hashed = mrmr.hash32(<Py_UNICODE*>string, len(string) * sizeof(Py_UNICODE), 0)
         return hashed
 
     cdef unicode unhash(self, StringHash hash_value):
         '''Fetch a string from the reverse index, given its hash value.'''
-        return from_bytes(self.bacov[hash_value])
+        return self.bacov[hash_value]
 
     cdef int find_split(self, unicode word, size_t length):
         return -1
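The hash_string change above is the heart of the string-storage work: the Py_UNICODE buffer is hashed directly, with no bytes round trip, and unhash now returns the stored unicode string as-is. Where the new code populates the bacov reverse index is outside these hunks; the old write sites in hash_string and lookup are both deleted. A minimal Python model of the hash/unhash pair, registering the string at hash time for illustration and using the mmh3 package as a stand-in for the vendored MurmurHash bindings:

    import mmh3  # assumed stand-in; the real code calls mrmr.hash32 on the raw buffer

    class StringStore:
        def __init__(self):
            self.bacov = {}   # hash -> unicode, the reverse index

        def hash_string(self, string):
            hashed = mmh3.hash(string)     # 32-bit MurmurHash of the text
            self.bacov[hashed] = string    # stored as unicode, not bytes
            return hashed

        def unhash(self, hash_value):
            return self.bacov[hash_value]

    store = StringStore()
    h = store.hash_string("tokenize")
    assert store.unhash(h) == "tokenize"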
|
@@ -231,12 +212,12 @@ cdef class Language:
             word = self.init_lexeme(hashed, token_string)
 
 
-cdef inline bint _is_whitespace(Py_UNICODE c) nogil:
-    if c == ' ':
+cdef inline bint _is_whitespace(unsigned char c) nogil:
+    if c == b' ':
         return True
-    elif c == '\n':
+    elif c == b'\n':
         return True
-    elif c == '\t':
+    elif c == b'\t':
         return True
     else:
         return False