Merge branch 'master' of github.com:explosion/spaCy

This commit is contained in:
Roman Domrachev 2017-11-15 18:30:04 +03:00
commit b3311100c7
17 changed files with 193 additions and 128 deletions

View File

@ -29,7 +29,7 @@ def main(vectors_loc, lang=None):
nr_row, nr_dim = header.split() nr_row, nr_dim = header.split()
nlp.vocab.reset_vectors(width=int(nr_dim)) nlp.vocab.reset_vectors(width=int(nr_dim))
for line in file_: for line in file_:
line = line.decode('utf8') line = line.rstrip().decode('utf8')
pieces = line.rsplit(' ', nr_dim) pieces = line.rsplit(' ', nr_dim)
word = pieces[0] word = pieces[0]
vector = numpy.asarray([float(v) for v in pieces[1:]], dtype='f') vector = numpy.asarray([float(v) for v in pieces[1:]], dtype='f')

View File

@ -3,7 +3,7 @@ pathlib
numpy>=1.7 numpy>=1.7
cymem>=1.30,<1.32 cymem>=1.30,<1.32
preshed>=1.0.0,<2.0.0 preshed>=1.0.0,<2.0.0
thinc>=6.10.0,<6.11.0 thinc>=6.10.1,<6.11.0
murmurhash>=0.28,<0.29 murmurhash>=0.28,<0.29
plac<1.0.0,>=0.9.6 plac<1.0.0,>=0.9.6
six six

View File

@ -190,7 +190,7 @@ def setup_package():
'murmurhash>=0.28,<0.29', 'murmurhash>=0.28,<0.29',
'cymem>=1.30,<1.32', 'cymem>=1.30,<1.32',
'preshed>=1.0.0,<2.0.0', 'preshed>=1.0.0,<2.0.0',
'thinc>=6.10.0,<6.11.0', 'thinc>=6.10.1,<6.11.0',
'plac<1.0.0,>=0.9.6', 'plac<1.0.0,>=0.9.6',
'six', 'six',
'pathlib', 'pathlib',

View File

@ -3,7 +3,7 @@
# https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py # https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py
__title__ = 'spacy' __title__ = 'spacy'
__version__ = '2.0.2' __version__ = '2.0.3'
__summary__ = 'Industrial-strength Natural Language Processing (NLP) with Python and Cython' __summary__ = 'Industrial-strength Natural Language Processing (NLP) with Python and Cython'
__uri__ = 'https://spacy.io' __uri__ = 'https://spacy.io'
__author__ = 'Explosion AI' __author__ = 'Explosion AI'

View File

@ -11,6 +11,7 @@ import spacy
import sys import sys
import tqdm import tqdm
import cytoolz import cytoolz
import thinc.extra.datasets
def read_inputs(loc): def read_inputs(loc):
@ -32,14 +33,18 @@ def profile(cmd, lang, inputs=None):
""" """
Profile a spaCy pipeline, to find out which functions take the most time. Profile a spaCy pipeline, to find out which functions take the most time.
""" """
if inputs is None:
imdb_train, _ = thinc.extra.datasets.imdb()
inputs, _ = zip(*imdb_train)
inputs = inputs[:2000]
nlp = spacy.load(lang) nlp = spacy.load(lang)
texts = list(cytoolz.take(10000, inputs)) texts = list(cytoolz.take(10000, inputs))
cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(), cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(),
"Profile.prof") "Profile.prof")
s = pstats.Stats("Profile.prof") s = pstats.Stats("Profile.prof")
s.strip_dirs().sort_stats("time").print_stats() s.strip_dirs().sort_stats("cumtime").print_stats()
def parse_texts(nlp, texts): def parse_texts(nlp, texts):
for doc in nlp.pipe(tqdm.tqdm(texts), batch_size=128): for doc in nlp.pipe(tqdm.tqdm(texts), batch_size=16):
pass pass

View File

@ -179,7 +179,7 @@ class GoldCorpus(object):
gold_tuples = read_json_file(loc) gold_tuples = read_json_file(loc)
for item in gold_tuples: for item in gold_tuples:
yield item yield item
i += 1 i += len(item[1])
if self.limit and i >= self.limit: if self.limit and i >= self.limit:
break break

View File

@ -21,8 +21,25 @@ class JapaneseTokenizer(object):
words = [x.surface for x in self.tokenizer.tokenize(text)] words = [x.surface for x in self.tokenizer.tokenize(text)]
return Doc(self.vocab, words=words, spaces=[False]*len(words)) return Doc(self.vocab, words=words, spaces=[False]*len(words))
# add dummy methods for to_bytes, from_bytes, to_disk and from_disk to
# allow serialization (see #1557)
def to_bytes(self, **exclude):
return b''
def from_bytes(self, bytes_data, **exclude):
return self
def to_disk(self, path, **exclude):
return None
def from_disk(self, path, **exclude):
return self
class JapaneseDefaults(Language.Defaults): class JapaneseDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'ja'
@classmethod @classmethod
def create_tokenizer(cls, nlp=None): def create_tokenizer(cls, nlp=None):
return JapaneseTokenizer(cls, nlp) return JapaneseTokenizer(cls, nlp)

View File

@ -9,36 +9,31 @@ from thinc.typedefs cimport hash_t, class_t
from thinc.extra.search cimport MaxViolation from thinc.extra.search cimport MaxViolation
from .transition_system cimport TransitionSystem, Transition from .transition_system cimport TransitionSystem, Transition
from .stateclass cimport StateClass
from ..gold cimport GoldParse from ..gold cimport GoldParse
from .stateclass cimport StateC, StateClass
# These are passed as callbacks to thinc.search.Beam # These are passed as callbacks to thinc.search.Beam
cdef int _transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1: cdef int _transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1:
dest = <StateClass>_dest dest = <StateC*>_dest
src = <StateClass>_src src = <StateC*>_src
moves = <const Transition*>_moves moves = <const Transition*>_moves
dest.clone(src) dest.clone(src)
moves[clas].do(dest.c, moves[clas].label) moves[clas].do(dest, moves[clas].label)
dest.c.push_hist(clas) dest.push_hist(clas)
cdef int _check_final_state(void* _state, void* extra_args) except -1: cdef int _check_final_state(void* _state, void* extra_args) except -1:
return (<StateClass>_state).is_final() state = <StateC*>_state
return state.is_final()
def _cleanup(Beam beam):
for i in range(beam.width):
Py_XDECREF(<PyObject*>beam._states[i].content)
Py_XDECREF(<PyObject*>beam._parents[i].content)
cdef hash_t _hash_state(void* _state, void* _) except 0: cdef hash_t _hash_state(void* _state, void* _) except 0:
state = <StateClass>_state state = <StateC*>_state
if state.c.is_final(): if state.is_final():
return 1 return 1
else: else:
return state.c.hash() return state.hash()
cdef class ParserBeam(object): cdef class ParserBeam(object):
@ -55,14 +50,15 @@ cdef class ParserBeam(object):
self.golds = golds self.golds = golds
self.beams = [] self.beams = []
cdef Beam beam cdef Beam beam
cdef StateClass state, st cdef StateClass state
cdef StateC* st
for state in states: for state in states:
beam = Beam(self.moves.n_moves, width, density) beam = Beam(self.moves.n_moves, width, density)
beam.initialize(self.moves.init_beam_state, state.c.length, beam.initialize(self.moves.init_beam_state, state.c.length,
state.c._sent) state.c._sent)
for i in range(beam.width): for i in range(beam.width):
st = <StateClass>beam.at(i) st = <StateC*>beam.at(i)
st.c.offset = state.c.offset st.offset = state.c.offset
self.beams.append(beam) self.beams.append(beam)
self.dones = [False] * len(self.beams) self.dones = [False] * len(self.beams)
@ -85,14 +81,12 @@ cdef class ParserBeam(object):
self._set_scores(beam, scores[i]) self._set_scores(beam, scores[i])
if self.golds is not None: if self.golds is not None:
self._set_costs(beam, self.golds[i], follow_gold=follow_gold) self._set_costs(beam, self.golds[i], follow_gold=follow_gold)
if follow_gold: beam.advance(_transition_state, NULL, <void*>self.moves.c)
beam.advance(_transition_state, NULL, <void*>self.moves.c)
else:
beam.advance(_transition_state, _hash_state, <void*>self.moves.c)
beam.check_done(_check_final_state, NULL) beam.check_done(_check_final_state, NULL)
# This handles the non-monotonic stuff for the parser.
if beam.is_done and self.golds is not None: if beam.is_done and self.golds is not None:
for j in range(beam.size): for j in range(beam.size):
state = <StateClass>beam.at(j) state = StateClass.borrow(<StateC*>beam.at(j))
if state.is_final(): if state.is_final():
try: try:
if self.moves.is_gold_parse(state, self.golds[i]): if self.moves.is_gold_parse(state, self.golds[i]):
@ -107,11 +101,11 @@ cdef class ParserBeam(object):
cdef int nr_state = min(scores.shape[0], beam.size) cdef int nr_state = min(scores.shape[0], beam.size)
cdef int nr_class = scores.shape[1] cdef int nr_class = scores.shape[1]
for i in range(nr_state): for i in range(nr_state):
state = <StateClass>beam.at(i) state = <StateC*>beam.at(i)
if not state.is_final(): if not state.is_final():
for j in range(nr_class): for j in range(nr_class):
beam.scores[i][j] = c_scores[i * nr_class + j] beam.scores[i][j] = c_scores[i * nr_class + j]
self.moves.set_valid(beam.is_valid[i], state.c) self.moves.set_valid(beam.is_valid[i], state)
else: else:
for j in range(beam.nr_class): for j in range(beam.nr_class):
beam.scores[i][j] = 0 beam.scores[i][j] = 0
@ -119,8 +113,8 @@ cdef class ParserBeam(object):
def _set_costs(self, Beam beam, GoldParse gold, int follow_gold=False): def _set_costs(self, Beam beam, GoldParse gold, int follow_gold=False):
for i in range(beam.size): for i in range(beam.size):
state = <StateClass>beam.at(i) state = StateClass.borrow(<StateC*>beam.at(i))
if not state.c.is_final(): if not state.is_final():
self.moves.set_costs(beam.is_valid[i], beam.costs[i], self.moves.set_costs(beam.is_valid[i], beam.costs[i],
state, gold) state, gold)
if follow_gold: if follow_gold:
@ -157,7 +151,7 @@ def update_beam(TransitionSystem moves, int nr_feature, int max_steps,
pbeam = ParserBeam(moves, states, golds, pbeam = ParserBeam(moves, states, golds,
width=width, density=density) width=width, density=density)
gbeam = ParserBeam(moves, states, golds, gbeam = ParserBeam(moves, states, golds,
width=width, density=0.0) width=width, density=density)
cdef StateClass state cdef StateClass state
beam_maps = [] beam_maps = []
backprops = [] backprops = []
@ -231,7 +225,7 @@ def get_states(pbeams, gbeams, beam_map, nr_update):
p_indices.append([]) p_indices.append([])
g_indices.append([]) g_indices.append([])
for i in range(pbeam.size): for i in range(pbeam.size):
state = <StateClass>pbeam.at(i) state = StateClass.borrow(<StateC*>pbeam.at(i))
if not state.is_final(): if not state.is_final():
key = tuple([eg_id] + pbeam.histories[i]) key = tuple([eg_id] + pbeam.histories[i])
assert key not in seen, (key, seen) assert key not in seen, (key, seen)
@ -240,7 +234,7 @@ def get_states(pbeams, gbeams, beam_map, nr_update):
states.append(state) states.append(state)
beam_map.update(seen) beam_map.update(seen)
for i in range(gbeam.size): for i in range(gbeam.size):
state = <StateClass>gbeam.at(i) state = StateClass.borrow(<StateC*>gbeam.at(i))
if not state.is_final(): if not state.is_final():
key = tuple([eg_id] + gbeam.histories[i]) key = tuple([eg_id] + gbeam.histories[i])
if key in seen: if key in seen:

View File

@ -292,12 +292,16 @@ cdef int _get_root(int word, const GoldParseC* gold) nogil:
cdef void* _init_state(Pool mem, int length, void* tokens) except NULL: cdef void* _init_state(Pool mem, int length, void* tokens) except NULL:
cdef StateClass st = StateClass.init(<const TokenC*>tokens, length) st = new StateC(<const TokenC*>tokens, length)
for i in range(st.c.length): for i in range(st.length):
st.c._sent[i].l_edge = i if st._sent[i].dep == 0:
st.c._sent[i].r_edge = i st._sent[i].l_edge = i
st._sent[i].r_edge = i
st._sent[i].head = 0
st._sent[i].dep = 0
st._sent[i].l_kids = 0
st._sent[i].r_kids = 0
st.fast_forward() st.fast_forward()
Py_INCREF(st)
return <void*>st return <void*>st
@ -533,18 +537,18 @@ cdef class ArcEager(TransitionSystem):
assert n_gold >= 1 assert n_gold >= 1
def get_beam_annot(self, Beam beam): def get_beam_annot(self, Beam beam):
length = (<StateClass>beam.at(0)).c.length length = (<StateC*>beam.at(0)).length
heads = [{} for _ in range(length)] heads = [{} for _ in range(length)]
deps = [{} for _ in range(length)] deps = [{} for _ in range(length)]
probs = beam.probs probs = beam.probs
for i in range(beam.size): for i in range(beam.size):
stcls = <StateClass>beam.at(i) state = <StateC*>beam.at(i)
self.finalize_state(stcls.c) self.finalize_state(state)
if stcls.is_final(): if state.is_final():
prob = probs[i] prob = probs[i]
for j in range(stcls.c.length): for j in range(state.length):
head = j + stcls.c._sent[j].head head = j + state._sent[j].head
dep = stcls.c._sent[j].dep dep = state._sent[j].dep
heads[j].setdefault(head, 0.0) heads[j].setdefault(head, 0.0)
heads[j][head] += prob heads[j][head] += prob
deps[j].setdefault(dep, 0.0) deps[j].setdefault(dep, 0.0)

View File

@ -123,14 +123,14 @@ cdef class BiluoPushDown(TransitionSystem):
entities = {} entities = {}
probs = beam.probs probs = beam.probs
for i in range(beam.size): for i in range(beam.size):
stcls = <StateClass>beam.at(i) state = <StateC*>beam.at(i)
if stcls.is_final(): if state.is_final():
self.finalize_state(stcls.c) self.finalize_state(state)
prob = probs[i] prob = probs[i]
for j in range(stcls.c._e_i): for j in range(state._e_i):
start = stcls.c._ents[j].start start = state._ents[j].start
end = stcls.c._ents[j].end end = state._ents[j].end
label = stcls.c._ents[j].label label = state._ents[j].label
entities.setdefault((start, end, label), 0.0) entities.setdefault((start, end, label), 0.0)
entities[(start, end, label)] += prob entities[(start, end, label)] += prob
return entities return entities
@ -139,15 +139,15 @@ cdef class BiluoPushDown(TransitionSystem):
parses = [] parses = []
probs = beam.probs probs = beam.probs
for i in range(beam.size): for i in range(beam.size):
stcls = <StateClass>beam.at(i) state = <StateC*>beam.at(i)
if stcls.is_final(): if state.is_final():
self.finalize_state(stcls.c) self.finalize_state(state)
prob = probs[i] prob = probs[i]
parse = [] parse = []
for j in range(stcls.c._e_i): for j in range(state._e_i):
start = stcls.c._ents[j].start start = state._ents[j].start
end = stcls.c._ents[j].end end = state._ents[j].end
label = stcls.c._ents[j].label label = state._ents[j].label
parse.append((start, end, self.strings[label])) parse.append((start, end, self.strings[label]))
parses.append((prob, parse)) parses.append((prob, parse))
return parses return parses

View File

@ -17,7 +17,7 @@ from cpython.ref cimport PyObject, Py_XDECREF
from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno
from libc.math cimport exp from libc.math cimport exp
from libcpp.vector cimport vector from libcpp.vector cimport vector
from libc.string cimport memset from libc.string cimport memset, memcpy
from libc.stdlib cimport calloc, free from libc.stdlib cimport calloc, free
from cymem.cymem cimport Pool from cymem.cymem cimport Pool
from thinc.typedefs cimport weight_t, class_t, hash_t from thinc.typedefs cimport weight_t, class_t, hash_t
@ -224,6 +224,16 @@ cdef void cpu_regression_loss(float* d_scores,
d_scores[i] = diff d_scores[i] = diff
def _collect_states(beams):
cdef StateClass state
cdef Beam beam
states = []
for beam in beams:
state = StateClass.borrow(<StateC*>beam.at(0))
states.append(state)
return states
cdef class Parser: cdef class Parser:
""" """
Base class of the DependencyParser and EntityRecognizer. Base class of the DependencyParser and EntityRecognizer.
@ -336,7 +346,7 @@ cdef class Parser:
beam_density=beam_density) beam_density=beam_density)
beam = beams[0] beam = beams[0]
output = self.moves.get_beam_annot(beam) output = self.moves.get_beam_annot(beam)
state = <StateClass>beam.at(0) state = StateClass.borrow(<StateC*>beam.at(0))
self.set_annotations([doc], [state], tensors=tokvecs) self.set_annotations([doc], [state], tensors=tokvecs)
_cleanup(beam) _cleanup(beam)
return output return output
@ -356,10 +366,10 @@ cdef class Parser:
if beam_density is None: if beam_density is None:
beam_density = self.cfg.get('beam_density', 0.0) beam_density = self.cfg.get('beam_density', 0.0)
cdef Doc doc cdef Doc doc
cdef Beam beam
for batch in cytoolz.partition_all(batch_size, docs): for batch in cytoolz.partition_all(batch_size, docs):
batch = list(batch) batch_in_order = list(batch)
by_length = sorted(list(batch), key=lambda doc: len(doc)) by_length = sorted(batch_in_order, key=lambda doc: len(doc))
batch_beams = []
for subbatch in cytoolz.partition_all(8, by_length): for subbatch in cytoolz.partition_all(8, by_length):
subbatch = list(subbatch) subbatch = list(subbatch)
if beam_width == 1: if beam_width == 1:
@ -369,21 +379,20 @@ cdef class Parser:
beams, tokvecs = self.beam_parse(subbatch, beams, tokvecs = self.beam_parse(subbatch,
beam_width=beam_width, beam_width=beam_width,
beam_density=beam_density) beam_density=beam_density)
parse_states = [] parse_states = _collect_states(beams)
for beam in beams: self.set_annotations(subbatch, parse_states, tensors=None)
parse_states.append(<StateClass>beam.at(0)) for beam in beams:
self.set_annotations(subbatch, parse_states, tensors=tokvecs) _cleanup(beam)
yield from batch for doc in batch_in_order:
for beam in beams: yield doc
_cleanup(beam)
def parse_batch(self, docs): def parse_batch(self, docs):
cdef: cdef:
precompute_hiddens state2vec precompute_hiddens state2vec
StateClass stcls
Pool mem Pool mem
const float* feat_weights const float* feat_weights
StateC* st StateC* st
StateClass stcls
vector[StateC*] states vector[StateC*] states
int guess, nr_class, nr_feat, nr_piece, nr_dim, nr_state, nr_step int guess, nr_class, nr_feat, nr_piece, nr_dim, nr_state, nr_step
int j int j
@ -476,50 +485,59 @@ cdef class Parser:
cdef np.ndarray scores cdef np.ndarray scores
cdef Doc doc cdef Doc doc
cdef int nr_class = self.moves.n_moves cdef int nr_class = self.moves.n_moves
cdef StateClass stcls, output
cuda_stream = util.get_cuda_stream() cuda_stream = util.get_cuda_stream()
(tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model( (tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(
docs, cuda_stream, 0.0) docs, cuda_stream, 0.0)
beams = []
cdef int offset = 0 cdef int offset = 0
cdef int j = 0 cdef int j = 0
cdef int k cdef int k
beams = []
for doc in docs: for doc in docs:
beam = Beam(nr_class, beam_width, min_density=beam_density) beam = Beam(nr_class, beam_width, min_density=beam_density)
beam.initialize(self.moves.init_beam_state, doc.length, doc.c) beam.initialize(self.moves.init_beam_state, doc.length, doc.c)
for i in range(beam.width): for i in range(beam.width):
stcls = <StateClass>beam.at(i) state = <StateC*>beam.at(i)
stcls.c.offset = offset state.offset = offset
offset += len(doc) offset += len(doc)
beam.check_done(_check_final_state, NULL) beam.check_done(_check_final_state, NULL)
while not beam.is_done: beams.append(beam)
states = [] cdef np.ndarray token_ids
token_ids = numpy.zeros((len(docs) * beam_width, self.nr_feature),
dtype='i', order='C')
todo = [beam for beam in beams if not beam.is_done]
cdef int* c_ids
cdef int nr_feature = self.nr_feature
cdef int n_states
while todo:
todo = [beam for beam in beams if not beam.is_done]
token_ids.fill(-1)
c_ids = <int*>token_ids.data
n_states = 0
for beam in todo:
for i in range(beam.size): for i in range(beam.size):
stcls = <StateClass>beam.at(i) state = <StateC*>beam.at(i)
# This way we avoid having to score finalized states # This way we avoid having to score finalized states
# We do have to take care to keep indexes aligned, though # We do have to take care to keep indexes aligned, though
if not stcls.is_final(): if not state.is_final():
states.append(stcls) state.set_context_tokens(c_ids, nr_feature)
token_ids = self.get_token_ids(states) c_ids += nr_feature
vectors = state2vec(token_ids) n_states += 1
if self.cfg.get('hist_size', 0): if n_states == 0:
hists = numpy.asarray([st.history[:self.cfg['hist_size']] break
for st in states], dtype='i') vectors = state2vec(token_ids[:n_states])
scores = vec2scores((vectors, hists)) scores = vec2scores(vectors)
else: c_scores = <float*>scores.data
scores = vec2scores(vectors) for beam in todo:
j = 0
c_scores = <float*>scores.data
for i in range(beam.size): for i in range(beam.size):
stcls = <StateClass>beam.at(i) state = <StateC*>beam.at(i)
if not stcls.is_final(): if not state.is_final():
self.moves.set_valid(beam.is_valid[i], stcls.c) self.moves.set_valid(beam.is_valid[i], state)
for k in range(nr_class): memcpy(beam.scores[i], c_scores, nr_class * sizeof(float))
beam.scores[i][k] = c_scores[j * scores.shape[1] + k] c_scores += nr_class
j += 1 beam.advance(_transition_state, NULL, <void*>self.moves.c)
beam.advance(_transition_state, _hash_state, <void*>self.moves.c)
beam.check_done(_check_final_state, NULL) beam.check_done(_check_final_state, NULL)
beams.append(beam)
tokvecs = self.model[0].ops.unflatten(tokvecs, tokvecs = self.model[0].ops.unflatten(tokvecs,
[len(doc) for doc in docs]) [len(doc) for doc in docs])
return beams, tokvecs return beams, tokvecs
@ -527,7 +545,7 @@ cdef class Parser:
def update(self, docs, golds, drop=0., sgd=None, losses=None): def update(self, docs, golds, drop=0., sgd=None, losses=None):
if not any(self.moves.has_gold(gold) for gold in golds): if not any(self.moves.has_gold(gold) for gold in golds):
return None return None
if self.cfg.get('beam_width', 1) >= 2 and numpy.random.random() >= 0.5: if self.cfg.get('beam_width', 1) >= 2 and numpy.random.random() >= 0.0:
return self.update_beam(docs, golds, return self.update_beam(docs, golds,
self.cfg['beam_width'], self.cfg['beam_density'], self.cfg['beam_width'], self.cfg['beam_density'],
drop=drop, sgd=sgd, losses=losses) drop=drop, sgd=sgd, losses=losses)
@ -965,27 +983,40 @@ cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) no
# These are passed as callbacks to thinc.search.Beam # These are passed as callbacks to thinc.search.Beam
cdef int _transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1: cdef int _transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1:
dest = <StateClass>_dest dest = <StateC*>_dest
src = <StateClass>_src src = <StateC*>_src
moves = <const Transition*>_moves moves = <const Transition*>_moves
dest.clone(src) dest.clone(src)
moves[clas].do(dest.c, moves[clas].label) moves[clas].do(dest, moves[clas].label)
dest.c.push_hist(clas) dest.push_hist(clas)
cdef int _check_final_state(void* _state, void* extra_args) except -1: cdef int _check_final_state(void* _state, void* extra_args) except -1:
return (<StateClass>_state).is_final() state = <StateC*>_state
return state.is_final()
def _cleanup(Beam beam): def _cleanup(Beam beam):
cdef StateC* state
# Once parsing has finished, states in beam may not be unique. Is this
# correct?
seen = set()
for i in range(beam.width): for i in range(beam.width):
Py_XDECREF(<PyObject*>beam._states[i].content) addr = <size_t>beam._parents[i].content
Py_XDECREF(<PyObject*>beam._parents[i].content) if addr not in seen:
state = <StateC*>addr
del state
cdef hash_t _hash_state(void* _state, void* _) except 0: seen.add(addr)
state = <StateClass>_state else:
if state.c.is_final(): print(i, addr)
return 1 print(seen)
else: raise Exception
return state.c.hash() addr = <size_t>beam._states[i].content
if addr not in seen:
state = <StateC*>addr
del state
seen.add(addr)
else:
print(i, addr)
print(seen)
raise Exception

View File

@ -13,12 +13,22 @@ from ._state cimport StateC
cdef class StateClass: cdef class StateClass:
cdef Pool mem cdef Pool mem
cdef StateC* c cdef StateC* c
cdef int _borrowed
@staticmethod @staticmethod
cdef inline StateClass init(const TokenC* sent, int length): cdef inline StateClass init(const TokenC* sent, int length):
cdef StateClass self = StateClass() cdef StateClass self = StateClass()
self.c = new StateC(sent, length) self.c = new StateC(sent, length)
return self return self
@staticmethod
cdef inline StateClass borrow(StateC* ptr):
cdef StateClass self = StateClass()
del self.c
self.c = ptr
self._borrowed = 1
return self
@staticmethod @staticmethod
cdef inline StateClass init_offset(const TokenC* sent, int length, int cdef inline StateClass init_offset(const TokenC* sent, int length, int

View File

@ -11,12 +11,14 @@ cdef class StateClass:
def __init__(self, Doc doc=None, int offset=0): def __init__(self, Doc doc=None, int offset=0):
cdef Pool mem = Pool() cdef Pool mem = Pool()
self.mem = mem self.mem = mem
self._borrowed = 0
if doc is not None: if doc is not None:
self.c = new StateC(doc.c, doc.length) self.c = new StateC(doc.c, doc.length)
self.c.offset = offset self.c.offset = offset
def __dealloc__(self): def __dealloc__(self):
del self.c if self._borrowed != 1:
del self.c
@property @property
def stack(self): def stack(self):

View File

@ -23,8 +23,7 @@ class OracleError(Exception):
cdef void* _init_state(Pool mem, int length, void* tokens) except NULL: cdef void* _init_state(Pool mem, int length, void* tokens) except NULL:
cdef StateClass st = StateClass.init(<const TokenC*>tokens, length) cdef StateC* st = new StateC(<const TokenC*>tokens, length)
Py_INCREF(st)
return <void*>st return <void*>st

View File

@ -1,4 +1,5 @@
# cython: embedsignature=True # cython: embedsignature=True
# cython: profile=True
# coding: utf8 # coding: utf8
from __future__ import unicode_literals from __future__ import unicode_literals
@ -274,7 +275,7 @@ cdef class Tokenizer:
int has_special, int n) except -1: int has_special, int n) except -1:
cdef int i cdef int i
for i in range(n): for i in range(n):
if tokens[i].lex.id == 0: if self.vocab._by_hash.get(tokens[i].lex.orth) == NULL:
return 0 return 0
# See https://github.com/explosion/spaCy/issues/1250 # See https://github.com/explosion/spaCy/issues/1250
if has_special: if has_special:

View File

@ -1,4 +1,5 @@
# coding: utf8 # coding: utf8
# cython: profile=True
from __future__ import unicode_literals from __future__ import unicode_literals
import numpy import numpy
@ -154,7 +155,10 @@ cdef class Vocab:
lex = <LexemeC*>mem.alloc(sizeof(LexemeC), 1) lex = <LexemeC*>mem.alloc(sizeof(LexemeC), 1)
lex.orth = self.strings.add(string) lex.orth = self.strings.add(string)
lex.length = len(string) lex.length = len(string)
lex.id = self.length if self.vectors is not None:
lex.id = self.vectors.key2row.get(lex.orth, 0)
else:
lex.id = 0
if self.lex_attr_getters is not None: if self.lex_attr_getters is not None:
for attr, func in self.lex_attr_getters.items(): for attr, func in self.lex_attr_getters.items():
value = func(string) value = func(string)
@ -164,9 +168,7 @@ cdef class Vocab:
lex.prob = value lex.prob = value
elif value is not None: elif value is not None:
Lexeme.set_struct_attr(lex, attr, value) Lexeme.set_struct_attr(lex, attr, value)
if is_oov: if not is_oov:
lex.id = 0
else:
key = hash_string(string) key = hash_string(string)
self._add_lex_to_vocab(key, lex) self._add_lex_to_vocab(key, lex)
assert lex != NULL, string assert lex != NULL, string

View File

@ -257,7 +257,7 @@ p
+row +row
+cell #[code dev_data] +cell #[code dev_data]
+cell positional +cell positional
+cell Location of JSON-formatted dev data (optional). +cell Location of JSON-formatted development data for evaluation.
+row +row
+cell #[code --n-iter], #[code -n] +cell #[code --n-iter], #[code -n]