Mirror of https://github.com/explosion/spaCy.git (synced 2025-02-04 21:50:35 +03:00)
* Fixed major efficiency problem, from not quite grokking pass-by-reference in Cython C++
This commit is contained in:
    parent aaae66114c
    commit ff1869ff07
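The change running through this diff is to stop holding the vocabulary as a C++ object value (`cdef Vocab VOCAB`) passed around as a `Vocab&` reference, and instead to heap-allocate it once (`VOCAB = new Vocab(100000)`) and pass a `Vocab*` pointer everywhere, indexing through `vocab[0][hashed]`. Below is a minimal sketch of that pointer-based pattern, using `libcpp.unordered_map` as a stand-in for the dense-hash-map-backed `Vocab`; the file name and the `get_or_insert` helper are illustrative only, not from the commit.

    # sketch.pyx -- illustrative only, not part of this commit
    # distutils: language = c++
    from libc.stdint cimport uint64_t
    from libcpp.unordered_map cimport unordered_map

    ctypedef unordered_map[uint64_t, size_t] Vocab   # stand-in for the dense_hash_map Vocab

    cdef Vocab* VOCAB                  # module-level pointer, as in the diff
    VOCAB = new Vocab()                # one shared table, allocated on the heap once

    cdef size_t get_or_insert(Vocab* vocab, uint64_t key):
        # vocab[0][key] dereferences the pointer and indexes the map in place,
        # so the table itself is never copied at a call boundary.
        if vocab[0].count(key) == 0:
            vocab[0][key] = 1
        return vocab[0][key]

Passing the pointer keeps a single table shared across modules; with a value global and `Vocab&`/value parameters it is easy to invoke the map's copy constructor by accident, which appears to be the efficiency problem the commit message describes.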
@@ -1,11 +1,13 @@
+# cython profile=True
+
 from libc.stdint cimport uint64_t, int64_t


 cdef extern from "../include/MurmurHash3.h":
-    void MurmurHash3_x86_32(void * key, uint64_t len, uint64_t seed, void* out)
-    void MurmurHash3_x86_128(void * key, uint64_t len, uint64_t seed, void* out)
+    void MurmurHash3_x86_32(void * key, uint64_t len, uint64_t seed, void* out) nogil
+    void MurmurHash3_x86_128(void * key, uint64_t len, uint64_t seed, void* out) nogil


 cdef extern from "../include/MurmurHash2.h":
-    uint64_t MurmurHash64A(void * key, uint64_t len, int64_t seed)
-    uint64_t MurmurHash64B(void * key, uint64_t len, int64_t seed)
+    uint64_t MurmurHash64A(void * key, uint64_t len, int64_t seed) nogil
+    uint64_t MurmurHash64B(void * key, uint64_t len, int64_t seed) nogil
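The four declarations above only gain a `nogil` qualifier, but that is what lets the hashing run with the GIL released: Cython will only call an external C function from a `with nogil:` block (or from another `nogil` function) if its declaration is marked `nogil`. A small usage sketch, assuming a `hash_bytes` helper that is not part of this commit:

    # nogil_usage_sketch.pyx -- illustrative only, not part of this commit
    from libc.stdint cimport uint64_t, int64_t

    cdef extern from "../include/MurmurHash2.h":
        uint64_t MurmurHash64A(void* key, uint64_t len, int64_t seed) nogil

    def hash_bytes(bytes data):
        cdef char* buf = data
        cdef uint64_t n = len(data)
        cdef uint64_t h
        with nogil:
            # Releasing the GIL here is legal only because MurmurHash64A is declared nogil.
            h = MurmurHash64A(<void*>buf, n, 0)
        return h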
@@ -0,0 +1 @@
+# cython: profile=True
@@ -0,0 +1 @@
+# cython profile=True
@@ -6,7 +6,7 @@ from spacy.lexeme cimport Lexeme
 from spacy.lexeme cimport Lexeme_addr


-cdef Vocab VOCAB
+cdef Vocab* VOCAB
 cdef dict BACOV

@@ -1,3 +1,4 @@
+# cython: profile=True
 '''Serve pointers to Lexeme structs, given strings. Maintain a reverse index,
 so that strings can be retrieved from hashes. Use 64-bit hash values and
 boldly assume no collisions.
@@ -15,19 +16,18 @@ from . import util
 cimport spacy

 BACOV = {}
-VOCAB = Vocab()
+VOCAB = new Vocab(100000)
 VOCAB.set_empty_key(0)


 spacy.load_tokenization(VOCAB, BACOV, util.read_tokenization('en'))


 cpdef vector[Lexeme_addr] tokenize(unicode string) except *:
     return spacy.tokenize(VOCAB, BACOV, find_split, string)


 cpdef Lexeme_addr lookup(unicode string) except 0:
-    return spacy.lookup(VOCAB, BACOV, find_split, -1, string)
+    return spacy.lookup(VOCAB, BACOV, find_split, -1, string, len(string))


 cpdef unicode unhash(StringHash hash_value):
@@ -72,3 +72,6 @@ cdef bint is_punct(unicode word, size_t i, size_t length):
     if word[i] == "." and i < (length - 1) and word[i+1].isdigit():
         return False
     return not word[i].isalnum()
+
+
+#spacy.load_browns(VOCAB, BACOV, find_split)
@@ -6,7 +6,7 @@ from spacy.lexeme cimport Lexeme
 from spacy.lexeme cimport Lexeme_addr


-cdef Vocab VOCAB
+cdef Vocab* VOCAB
 cdef dict BACOV

@@ -15,7 +15,7 @@ from . import util
 cimport spacy

 BACOV = {}
-VOCAB = Vocab()
+VOCAB = new Vocab(100000)
 VOCAB.set_empty_key(0)

@@ -27,7 +27,7 @@ cpdef vector[Lexeme_addr] tokenize(unicode string) except *:


 cpdef Lexeme_addr lookup(unicode string) except 0:
-    return spacy.lookup(VOCAB, BACOV, find_split, -1, string)
+    return spacy.lookup(VOCAB, BACOV, find_split, -1, string, len(string))


 cpdef unicode unhash(StringHash hash_value):
@@ -25,9 +25,9 @@ cdef struct Lexeme:

 cdef Lexeme BLANK_WORD = Lexeme(0, 0, 0, 0, 0, 0.0, 0, False, False, NULL)

-cdef Lexeme* init_lexeme(Vocab vocab, dict bacov, Splitter find_split,
+cdef Lexeme* init_lexeme(Vocab* vocab, dict bacov, Splitter find_split,
                          unicode string, StringHash hashed,
-                         int split, size_t length) except NULL
+                         int split, size_t length)

 # Use these to access the Lexeme fields via get_attr(Lexeme*, LexAttr), which
 # has a conditional to pick out the correct item. This allows safe iteration
@@ -1,3 +1,4 @@
+# cython: profile=True
 '''Accessors for Lexeme properties, given a lex_id, which is cast to a Lexeme*.
 Mostly useful from Python-space. From Cython-space, you can just cast to
 Lexeme* yourself.
@@ -13,9 +14,9 @@ from libc.stdint cimport uint64_t
 from libcpp.vector cimport vector


-cdef Lexeme* init_lexeme(Vocab vocab, dict bacov, Splitter find_split,
+cdef Lexeme* init_lexeme(Vocab* vocab, dict bacov, Splitter find_split,
                          unicode string, StringHash hashed,
-                         int split, size_t length) except NULL:
+                         int split, size_t length):
     assert split <= length
     cdef Lexeme* word = <Lexeme*>calloc(1, sizeof(Lexeme))

@@ -54,7 +55,8 @@ cdef Lexeme* init_lexeme(Vocab vocab, dict bacov, Splitter find_split,

     # Now recurse, and deal with the tail
     if tail_string:
-        word.tail = <Lexeme*>lookup(vocab, bacov, find_split, -1, tail_string)
+        word.tail = <Lexeme*>lookup(vocab, bacov, find_split, -1, tail_string,
+                                    len(tail_string))
     return word

@@ -12,12 +12,13 @@ ctypedef int (*Splitter)(unicode word, size_t length)

 from spacy.lexeme cimport Lexeme

-cdef load_tokenization(Vocab& vocab, dict bacov, token_rules)
-cdef vector[Lexeme_addr] tokenize(Vocab& vocab, dict bacov, Splitter splitter,
+cdef load_tokenization(Vocab* vocab, dict bacov, token_rules)
+cdef load_browns(Vocab* vocab, dict bacov, Splitter find_split)
+cdef vector[Lexeme_addr] tokenize(Vocab* vocab, dict bacov, Splitter splitter,
                                   unicode string) except *
-cdef Lexeme_addr lookup(Vocab& vocab, dict bacov, Splitter splitter, int start,
-                        unicode string) except 0
-cdef StringHash hash_string(unicode s, size_t length) except 0
+cdef Lexeme_addr lookup(Vocab* vocab, dict bacov, Splitter splitter, int start,
+                        Py_UNICODE* string, size_t length) except 0
+cdef StringHash hash_string(Py_UNICODE* s, size_t length) nogil
 cdef unicode unhash(dict bacov, StringHash hash_value)

@@ -1,3 +1,4 @@
+# cython: profile=True
 from __future__ import unicode_literals

 from ext.murmurhash cimport MurmurHash64A
@@ -9,14 +10,16 @@ from spacy.lexeme cimport BLANK_WORD
 from spacy.string_tools cimport is_whitespace

 from . import util
+from os import path
+cimport cython


-cdef load_tokenization(Vocab& vocab, dict bacov, token_rules):
+cdef load_tokenization(Vocab* vocab, dict bacov, token_rules):
     cdef Lexeme* word
     cdef StringHash hashed
     for chunk, lex, tokens in token_rules:
         hashed = hash_string(chunk, len(chunk))
-        assert vocab[hashed] == 0, chunk
+        assert vocab[0][hashed] == 0, chunk
         word = _add(vocab, bacov, <Splitter>NULL, hashed, lex, len(lex), len(lex))
         for i, lex in enumerate(tokens):
             token_string = '%s:@:%d:@:%s' % (chunk, i, lex)
@@ -26,7 +29,29 @@ cdef load_tokenization(Vocab& vocab, dict bacov, token_rules):
             word = word.tail


-cdef vector[Lexeme_addr] tokenize(Vocab& vocab, dict bacov, Splitter splitter,
+cdef load_browns(Vocab* vocab, dict bacov, Splitter find_split):
+    cdef Lexeme* w
+    data_dir = path.join(path.dirname(__file__), '..', 'data', 'en')
+    case_stats = util.load_case_stats(data_dir)
+    brown_loc = path.join(data_dir, 'clusters')
+    cdef size_t start
+    cdef int end
+    with util.utf8open(brown_loc) as browns_file:
+        for i, line in enumerate(browns_file):
+            cluster_str, token_string, freq_str = line.split()
+            # Decode as a little-endian string, so that we can do & 15 to get
+            # the first 4 bits. See redshift._parse_features.pyx
+            cluster = int(cluster_str[::-1], 2)
+            upper_pc, title_pc = case_stats.get(token_string.lower(), (0.0, 0.0))
+            start = 0
+            end = -1
+            hashed = hash_string(token_string, len(token_string))
+
+            word = _add(vocab, bacov, find_split, hashed, token_string,
+                        len(token_string), len(token_string))
+
+
+cdef vector[Lexeme_addr] tokenize(Vocab* vocab, dict bacov, Splitter splitter,
                                   unicode string) except *:
     cdef size_t length = len(string)
     cdef Py_UNICODE* characters = <Py_UNICODE*>string
@@ -35,40 +60,54 @@ cdef vector[Lexeme_addr] tokenize(Vocab& vocab, dict bacov, Splitter splitter,
     cdef Py_UNICODE c

     cdef vector[Lexeme_addr] tokens = vector[Lexeme_addr]()
-    cdef unicode current = u''
+    cdef Py_UNICODE[1000] current
+    for i in range(1000):
+        current[i] = 0
+    cdef size_t word_len = 0
     cdef Lexeme* token
     for i in range(length):
         c = characters[i]
-        if is_whitespace(c):
-            if current:
-                token = <Lexeme*>lookup(vocab, bacov, splitter, -1, current)
+        if _is_whitespace(c):
+            if word_len != 0:
+                token = <Lexeme*>lookup(vocab, bacov, splitter, -1, current, word_len)
                 while token != NULL:
                     tokens.push_back(<Lexeme_addr>token)
                     token = token.tail
-                current = u''
+                for j in range(word_len+1):
+                    current[j] = 0
+                word_len = 0
         else:
-            current += c
-    if current:
-        token = <Lexeme*>lookup(vocab, bacov, splitter, -1, current)
+            current[word_len] = c
+            word_len += 1
+    if word_len != 0:
+        token = <Lexeme*>lookup(vocab, bacov, splitter, -1, current, word_len)
         while token != NULL:
             tokens.push_back(<Lexeme_addr>token)
             token = token.tail
     return tokens


-cdef Lexeme_addr lookup(Vocab& vocab, dict bacov, Splitter find_split, int start,
-                        unicode string) except 0:
+cdef inline bint _is_whitespace(Py_UNICODE c) nogil:
+    if c == ' ':
+        return True
+    elif c == '\n':
+        return True
+    elif c == '\t':
+        return True
+    else:
+        return False
+
+
+cdef Lexeme_addr lookup(Vocab* vocab, dict bacov, Splitter find_split, int start,
+                        Py_UNICODE* string, size_t length) except 0:
     '''Fetch a Lexeme representing a word string. If the word has not been seen,
     construct one, splitting off any attached punctuation or clitics. A
     reference to BLANK_WORD is returned for the empty string.

     To specify the boundaries of the word if it has not been seen, use lookup_chunk.
     '''
-    if string == '':
+    if length == 0:
         return <Lexeme_addr>&BLANK_WORD
-    cdef size_t length = len(string)
     cdef StringHash hashed = hash_string(string, length)
-    cdef Lexeme* word_ptr = <Lexeme*>vocab[hashed]
+    cdef Lexeme* word_ptr = <Lexeme*>vocab[0][hashed]
     if word_ptr == NULL:
         start = find_split(string, length) if start == -1 else start
         word_ptr = _add(vocab, bacov, find_split, hashed, string, start, length)
@@ -84,9 +123,8 @@ cpdef vector[size_t] expand_chunk(size_t addr) except *:
     return tokens


-cdef StringHash hash_string(unicode s, size_t length) except 0:
+cdef StringHash hash_string(Py_UNICODE* s, size_t length) nogil:
     '''Hash unicode with MurmurHash64A'''
-    assert length
     return MurmurHash64A(<Py_UNICODE*>s, length * sizeof(Py_UNICODE), 0)

@@ -95,11 +133,12 @@ cdef unicode unhash(dict bacov, StringHash hash_value):
     return bacov[hash_value]


-cdef Lexeme* _add(Vocab& vocab, dict bacov, Splitter find_split, StringHash hashed,
-                  unicode string, int split, size_t length) except NULL:
-    assert string
-    assert split <= length
+@cython.nonecheck(False)
+cdef Lexeme* _add(Vocab* vocab, dict bacov, Splitter find_split, StringHash hashed,
+                  unicode string, int split, size_t length):
     word = init_lexeme(vocab, bacov, find_split, string, hashed, split, length)
-    vocab[hashed] = <Lexeme_addr>word
+    vocab[0][hashed] = <Lexeme_addr>word
     bacov[hashed] = string
     return word

@@ -1,3 +1,5 @@
+# cython: profile=True
+
 cpdef unicode substr(unicode string, int start, int end, size_t length):
     if end >= length:
         end = -1
@@ -10,7 +10,7 @@ def utf8open(loc, mode='r'):


 def load_case_stats(data_dir):
-    case_loc = path.join(data_dir, 'english.case')
+    case_loc = path.join(data_dir, 'case')
     case_stats = {}
     with utf8open(case_loc) as cases_file:
         for line in cases_file:
@@ -42,46 +42,3 @@ def read_tokenization(lang):
         seen.add(chunk)
         entries.append((chunk, lex, pieces))
     return entries
-
-
-"""
-    def load_browns(self, data_dir):
-        cdef Lexeme* w
-        case_stats = load_case_stats(data_dir)
-        brown_loc = path.join(data_dir, 'bllip-clusters')
-        assert path.exists(brown_loc)
-        cdef size_t start
-        cdef int end
-        with utf8open(brown_loc) as browns_file:
-            for i, line in enumerate(browns_file):
-                cluster_str, word, freq_str = line.split()
-                # Decode as a little-endian string, so that we can do & 15 to get
-                # the first 4 bits. See redshift._parse_features.pyx
-                cluster = int(cluster_str[::-1], 2)
-                upper_pc, title_pc = case_stats.get(word.lower(), (0.0, 0.0))
-                start = 0
-                end = -1
-                find_slice(&start, &end, word)
-                print "Load", repr(word), start, end
-                w = <Lexeme*>init_word(word, start, end, cluster,
-                                       upper_pc, title_pc, int(freq_str))
-                self.words[_hash_str(word)] = <size_t>w
-                self.strings[<size_t>w] = word
-
-    def load_clitics(self, data_dir):
-        cdef unicode orig_str
-        cdef unicode clitic
-        for orig_str, norm_form, clitic_strs in util.load_clitics(data_dir):
-            w = init_clitic(orig_str, <Lexeme*>self.lookup_slice(norm_form, 0, -1))
-            self.words[w.orig] = <size_t>w
-            self.strings[<size_t>w] = orig_str
-            assert len(clitic_strs) < MAX_CLITICS
-            assert clitic_strs
-            for i, clitic in enumerate(clitic_strs):
-                # If we write punctuation here, assume we want to keep it,
-                # so tell it the slice boundaries (the full string)
-                w.clitics[i] = self.lookup_slice(clitic, 0, -1)
-                # Ensure we null terminate
-                w.clitics[i+1] = 0
-"""