mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 09:14:32 +03:00
Merge branch 'master' of github.com:explosion/spaCy
This commit is contained in:
commit
b3311100c7
|
@ -29,7 +29,7 @@ def main(vectors_loc, lang=None):
|
||||||
nr_row, nr_dim = header.split()
|
nr_row, nr_dim = header.split()
|
||||||
nlp.vocab.reset_vectors(width=int(nr_dim))
|
nlp.vocab.reset_vectors(width=int(nr_dim))
|
||||||
for line in file_:
|
for line in file_:
|
||||||
line = line.decode('utf8')
|
line = line.rstrip().decode('utf8')
|
||||||
pieces = line.rsplit(' ', nr_dim)
|
pieces = line.rsplit(' ', nr_dim)
|
||||||
word = pieces[0]
|
word = pieces[0]
|
||||||
vector = numpy.asarray([float(v) for v in pieces[1:]], dtype='f')
|
vector = numpy.asarray([float(v) for v in pieces[1:]], dtype='f')
|
||||||
|
|
|
@ -3,7 +3,7 @@ pathlib
|
||||||
numpy>=1.7
|
numpy>=1.7
|
||||||
cymem>=1.30,<1.32
|
cymem>=1.30,<1.32
|
||||||
preshed>=1.0.0,<2.0.0
|
preshed>=1.0.0,<2.0.0
|
||||||
thinc>=6.10.0,<6.11.0
|
thinc>=6.10.1,<6.11.0
|
||||||
murmurhash>=0.28,<0.29
|
murmurhash>=0.28,<0.29
|
||||||
plac<1.0.0,>=0.9.6
|
plac<1.0.0,>=0.9.6
|
||||||
six
|
six
|
||||||
|
|
2
setup.py
2
setup.py
|
@ -190,7 +190,7 @@ def setup_package():
|
||||||
'murmurhash>=0.28,<0.29',
|
'murmurhash>=0.28,<0.29',
|
||||||
'cymem>=1.30,<1.32',
|
'cymem>=1.30,<1.32',
|
||||||
'preshed>=1.0.0,<2.0.0',
|
'preshed>=1.0.0,<2.0.0',
|
||||||
'thinc>=6.10.0,<6.11.0',
|
'thinc>=6.10.1,<6.11.0',
|
||||||
'plac<1.0.0,>=0.9.6',
|
'plac<1.0.0,>=0.9.6',
|
||||||
'six',
|
'six',
|
||||||
'pathlib',
|
'pathlib',
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
# https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py
|
# https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py
|
||||||
|
|
||||||
__title__ = 'spacy'
|
__title__ = 'spacy'
|
||||||
__version__ = '2.0.2'
|
__version__ = '2.0.3'
|
||||||
__summary__ = 'Industrial-strength Natural Language Processing (NLP) with Python and Cython'
|
__summary__ = 'Industrial-strength Natural Language Processing (NLP) with Python and Cython'
|
||||||
__uri__ = 'https://spacy.io'
|
__uri__ = 'https://spacy.io'
|
||||||
__author__ = 'Explosion AI'
|
__author__ = 'Explosion AI'
|
||||||
|
|
|
@ -11,6 +11,7 @@ import spacy
|
||||||
import sys
|
import sys
|
||||||
import tqdm
|
import tqdm
|
||||||
import cytoolz
|
import cytoolz
|
||||||
|
import thinc.extra.datasets
|
||||||
|
|
||||||
|
|
||||||
def read_inputs(loc):
|
def read_inputs(loc):
|
||||||
|
@ -32,14 +33,18 @@ def profile(cmd, lang, inputs=None):
|
||||||
"""
|
"""
|
||||||
Profile a spaCy pipeline, to find out which functions take the most time.
|
Profile a spaCy pipeline, to find out which functions take the most time.
|
||||||
"""
|
"""
|
||||||
|
if inputs is None:
|
||||||
|
imdb_train, _ = thinc.extra.datasets.imdb()
|
||||||
|
inputs, _ = zip(*imdb_train)
|
||||||
|
inputs = inputs[:2000]
|
||||||
nlp = spacy.load(lang)
|
nlp = spacy.load(lang)
|
||||||
texts = list(cytoolz.take(10000, inputs))
|
texts = list(cytoolz.take(10000, inputs))
|
||||||
cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(),
|
cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(),
|
||||||
"Profile.prof")
|
"Profile.prof")
|
||||||
s = pstats.Stats("Profile.prof")
|
s = pstats.Stats("Profile.prof")
|
||||||
s.strip_dirs().sort_stats("time").print_stats()
|
s.strip_dirs().sort_stats("cumtime").print_stats()
|
||||||
|
|
||||||
|
|
||||||
def parse_texts(nlp, texts):
|
def parse_texts(nlp, texts):
|
||||||
for doc in nlp.pipe(tqdm.tqdm(texts), batch_size=128):
|
for doc in nlp.pipe(tqdm.tqdm(texts), batch_size=16):
|
||||||
pass
|
pass
|
||||||
|
|
|
@ -179,7 +179,7 @@ class GoldCorpus(object):
|
||||||
gold_tuples = read_json_file(loc)
|
gold_tuples = read_json_file(loc)
|
||||||
for item in gold_tuples:
|
for item in gold_tuples:
|
||||||
yield item
|
yield item
|
||||||
i += 1
|
i += len(item[1])
|
||||||
if self.limit and i >= self.limit:
|
if self.limit and i >= self.limit:
|
||||||
break
|
break
|
||||||
|
|
||||||
|
|
|
@ -21,8 +21,25 @@ class JapaneseTokenizer(object):
|
||||||
words = [x.surface for x in self.tokenizer.tokenize(text)]
|
words = [x.surface for x in self.tokenizer.tokenize(text)]
|
||||||
return Doc(self.vocab, words=words, spaces=[False]*len(words))
|
return Doc(self.vocab, words=words, spaces=[False]*len(words))
|
||||||
|
|
||||||
|
# add dummy methods for to_bytes, from_bytes, to_disk and from_disk to
|
||||||
|
# allow serialization (see #1557)
|
||||||
|
def to_bytes(self, **exclude):
|
||||||
|
return b''
|
||||||
|
|
||||||
|
def from_bytes(self, bytes_data, **exclude):
|
||||||
|
return self
|
||||||
|
|
||||||
|
def to_disk(self, path, **exclude):
|
||||||
|
return None
|
||||||
|
|
||||||
|
def from_disk(self, path, **exclude):
|
||||||
|
return self
|
||||||
|
|
||||||
|
|
||||||
class JapaneseDefaults(Language.Defaults):
|
class JapaneseDefaults(Language.Defaults):
|
||||||
|
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||||
|
lex_attr_getters[LANG] = lambda text: 'ja'
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def create_tokenizer(cls, nlp=None):
|
def create_tokenizer(cls, nlp=None):
|
||||||
return JapaneseTokenizer(cls, nlp)
|
return JapaneseTokenizer(cls, nlp)
|
||||||
|
|
|
@ -9,36 +9,31 @@ from thinc.typedefs cimport hash_t, class_t
|
||||||
from thinc.extra.search cimport MaxViolation
|
from thinc.extra.search cimport MaxViolation
|
||||||
|
|
||||||
from .transition_system cimport TransitionSystem, Transition
|
from .transition_system cimport TransitionSystem, Transition
|
||||||
from .stateclass cimport StateClass
|
|
||||||
from ..gold cimport GoldParse
|
from ..gold cimport GoldParse
|
||||||
|
from .stateclass cimport StateC, StateClass
|
||||||
|
|
||||||
|
|
||||||
# These are passed as callbacks to thinc.search.Beam
|
# These are passed as callbacks to thinc.search.Beam
|
||||||
cdef int _transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1:
|
cdef int _transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1:
|
||||||
dest = <StateClass>_dest
|
dest = <StateC*>_dest
|
||||||
src = <StateClass>_src
|
src = <StateC*>_src
|
||||||
moves = <const Transition*>_moves
|
moves = <const Transition*>_moves
|
||||||
dest.clone(src)
|
dest.clone(src)
|
||||||
moves[clas].do(dest.c, moves[clas].label)
|
moves[clas].do(dest, moves[clas].label)
|
||||||
dest.c.push_hist(clas)
|
dest.push_hist(clas)
|
||||||
|
|
||||||
|
|
||||||
cdef int _check_final_state(void* _state, void* extra_args) except -1:
|
cdef int _check_final_state(void* _state, void* extra_args) except -1:
|
||||||
return (<StateClass>_state).is_final()
|
state = <StateC*>_state
|
||||||
|
return state.is_final()
|
||||||
|
|
||||||
def _cleanup(Beam beam):
|
|
||||||
for i in range(beam.width):
|
|
||||||
Py_XDECREF(<PyObject*>beam._states[i].content)
|
|
||||||
Py_XDECREF(<PyObject*>beam._parents[i].content)
|
|
||||||
|
|
||||||
|
|
||||||
cdef hash_t _hash_state(void* _state, void* _) except 0:
|
cdef hash_t _hash_state(void* _state, void* _) except 0:
|
||||||
state = <StateClass>_state
|
state = <StateC*>_state
|
||||||
if state.c.is_final():
|
if state.is_final():
|
||||||
return 1
|
return 1
|
||||||
else:
|
else:
|
||||||
return state.c.hash()
|
return state.hash()
|
||||||
|
|
||||||
|
|
||||||
cdef class ParserBeam(object):
|
cdef class ParserBeam(object):
|
||||||
|
@ -55,14 +50,15 @@ cdef class ParserBeam(object):
|
||||||
self.golds = golds
|
self.golds = golds
|
||||||
self.beams = []
|
self.beams = []
|
||||||
cdef Beam beam
|
cdef Beam beam
|
||||||
cdef StateClass state, st
|
cdef StateClass state
|
||||||
|
cdef StateC* st
|
||||||
for state in states:
|
for state in states:
|
||||||
beam = Beam(self.moves.n_moves, width, density)
|
beam = Beam(self.moves.n_moves, width, density)
|
||||||
beam.initialize(self.moves.init_beam_state, state.c.length,
|
beam.initialize(self.moves.init_beam_state, state.c.length,
|
||||||
state.c._sent)
|
state.c._sent)
|
||||||
for i in range(beam.width):
|
for i in range(beam.width):
|
||||||
st = <StateClass>beam.at(i)
|
st = <StateC*>beam.at(i)
|
||||||
st.c.offset = state.c.offset
|
st.offset = state.c.offset
|
||||||
self.beams.append(beam)
|
self.beams.append(beam)
|
||||||
self.dones = [False] * len(self.beams)
|
self.dones = [False] * len(self.beams)
|
||||||
|
|
||||||
|
@ -85,14 +81,12 @@ cdef class ParserBeam(object):
|
||||||
self._set_scores(beam, scores[i])
|
self._set_scores(beam, scores[i])
|
||||||
if self.golds is not None:
|
if self.golds is not None:
|
||||||
self._set_costs(beam, self.golds[i], follow_gold=follow_gold)
|
self._set_costs(beam, self.golds[i], follow_gold=follow_gold)
|
||||||
if follow_gold:
|
beam.advance(_transition_state, NULL, <void*>self.moves.c)
|
||||||
beam.advance(_transition_state, NULL, <void*>self.moves.c)
|
|
||||||
else:
|
|
||||||
beam.advance(_transition_state, _hash_state, <void*>self.moves.c)
|
|
||||||
beam.check_done(_check_final_state, NULL)
|
beam.check_done(_check_final_state, NULL)
|
||||||
|
# This handles the non-monotonic stuff for the parser.
|
||||||
if beam.is_done and self.golds is not None:
|
if beam.is_done and self.golds is not None:
|
||||||
for j in range(beam.size):
|
for j in range(beam.size):
|
||||||
state = <StateClass>beam.at(j)
|
state = StateClass.borrow(<StateC*>beam.at(j))
|
||||||
if state.is_final():
|
if state.is_final():
|
||||||
try:
|
try:
|
||||||
if self.moves.is_gold_parse(state, self.golds[i]):
|
if self.moves.is_gold_parse(state, self.golds[i]):
|
||||||
|
@ -107,11 +101,11 @@ cdef class ParserBeam(object):
|
||||||
cdef int nr_state = min(scores.shape[0], beam.size)
|
cdef int nr_state = min(scores.shape[0], beam.size)
|
||||||
cdef int nr_class = scores.shape[1]
|
cdef int nr_class = scores.shape[1]
|
||||||
for i in range(nr_state):
|
for i in range(nr_state):
|
||||||
state = <StateClass>beam.at(i)
|
state = <StateC*>beam.at(i)
|
||||||
if not state.is_final():
|
if not state.is_final():
|
||||||
for j in range(nr_class):
|
for j in range(nr_class):
|
||||||
beam.scores[i][j] = c_scores[i * nr_class + j]
|
beam.scores[i][j] = c_scores[i * nr_class + j]
|
||||||
self.moves.set_valid(beam.is_valid[i], state.c)
|
self.moves.set_valid(beam.is_valid[i], state)
|
||||||
else:
|
else:
|
||||||
for j in range(beam.nr_class):
|
for j in range(beam.nr_class):
|
||||||
beam.scores[i][j] = 0
|
beam.scores[i][j] = 0
|
||||||
|
@ -119,8 +113,8 @@ cdef class ParserBeam(object):
|
||||||
|
|
||||||
def _set_costs(self, Beam beam, GoldParse gold, int follow_gold=False):
|
def _set_costs(self, Beam beam, GoldParse gold, int follow_gold=False):
|
||||||
for i in range(beam.size):
|
for i in range(beam.size):
|
||||||
state = <StateClass>beam.at(i)
|
state = StateClass.borrow(<StateC*>beam.at(i))
|
||||||
if not state.c.is_final():
|
if not state.is_final():
|
||||||
self.moves.set_costs(beam.is_valid[i], beam.costs[i],
|
self.moves.set_costs(beam.is_valid[i], beam.costs[i],
|
||||||
state, gold)
|
state, gold)
|
||||||
if follow_gold:
|
if follow_gold:
|
||||||
|
@ -157,7 +151,7 @@ def update_beam(TransitionSystem moves, int nr_feature, int max_steps,
|
||||||
pbeam = ParserBeam(moves, states, golds,
|
pbeam = ParserBeam(moves, states, golds,
|
||||||
width=width, density=density)
|
width=width, density=density)
|
||||||
gbeam = ParserBeam(moves, states, golds,
|
gbeam = ParserBeam(moves, states, golds,
|
||||||
width=width, density=0.0)
|
width=width, density=density)
|
||||||
cdef StateClass state
|
cdef StateClass state
|
||||||
beam_maps = []
|
beam_maps = []
|
||||||
backprops = []
|
backprops = []
|
||||||
|
@ -231,7 +225,7 @@ def get_states(pbeams, gbeams, beam_map, nr_update):
|
||||||
p_indices.append([])
|
p_indices.append([])
|
||||||
g_indices.append([])
|
g_indices.append([])
|
||||||
for i in range(pbeam.size):
|
for i in range(pbeam.size):
|
||||||
state = <StateClass>pbeam.at(i)
|
state = StateClass.borrow(<StateC*>pbeam.at(i))
|
||||||
if not state.is_final():
|
if not state.is_final():
|
||||||
key = tuple([eg_id] + pbeam.histories[i])
|
key = tuple([eg_id] + pbeam.histories[i])
|
||||||
assert key not in seen, (key, seen)
|
assert key not in seen, (key, seen)
|
||||||
|
@ -240,7 +234,7 @@ def get_states(pbeams, gbeams, beam_map, nr_update):
|
||||||
states.append(state)
|
states.append(state)
|
||||||
beam_map.update(seen)
|
beam_map.update(seen)
|
||||||
for i in range(gbeam.size):
|
for i in range(gbeam.size):
|
||||||
state = <StateClass>gbeam.at(i)
|
state = StateClass.borrow(<StateC*>gbeam.at(i))
|
||||||
if not state.is_final():
|
if not state.is_final():
|
||||||
key = tuple([eg_id] + gbeam.histories[i])
|
key = tuple([eg_id] + gbeam.histories[i])
|
||||||
if key in seen:
|
if key in seen:
|
||||||
|
|
|
@ -292,12 +292,16 @@ cdef int _get_root(int word, const GoldParseC* gold) nogil:
|
||||||
|
|
||||||
|
|
||||||
cdef void* _init_state(Pool mem, int length, void* tokens) except NULL:
|
cdef void* _init_state(Pool mem, int length, void* tokens) except NULL:
|
||||||
cdef StateClass st = StateClass.init(<const TokenC*>tokens, length)
|
st = new StateC(<const TokenC*>tokens, length)
|
||||||
for i in range(st.c.length):
|
for i in range(st.length):
|
||||||
st.c._sent[i].l_edge = i
|
if st._sent[i].dep == 0:
|
||||||
st.c._sent[i].r_edge = i
|
st._sent[i].l_edge = i
|
||||||
|
st._sent[i].r_edge = i
|
||||||
|
st._sent[i].head = 0
|
||||||
|
st._sent[i].dep = 0
|
||||||
|
st._sent[i].l_kids = 0
|
||||||
|
st._sent[i].r_kids = 0
|
||||||
st.fast_forward()
|
st.fast_forward()
|
||||||
Py_INCREF(st)
|
|
||||||
return <void*>st
|
return <void*>st
|
||||||
|
|
||||||
|
|
||||||
|
@ -533,18 +537,18 @@ cdef class ArcEager(TransitionSystem):
|
||||||
assert n_gold >= 1
|
assert n_gold >= 1
|
||||||
|
|
||||||
def get_beam_annot(self, Beam beam):
|
def get_beam_annot(self, Beam beam):
|
||||||
length = (<StateClass>beam.at(0)).c.length
|
length = (<StateC*>beam.at(0)).length
|
||||||
heads = [{} for _ in range(length)]
|
heads = [{} for _ in range(length)]
|
||||||
deps = [{} for _ in range(length)]
|
deps = [{} for _ in range(length)]
|
||||||
probs = beam.probs
|
probs = beam.probs
|
||||||
for i in range(beam.size):
|
for i in range(beam.size):
|
||||||
stcls = <StateClass>beam.at(i)
|
state = <StateC*>beam.at(i)
|
||||||
self.finalize_state(stcls.c)
|
self.finalize_state(state)
|
||||||
if stcls.is_final():
|
if state.is_final():
|
||||||
prob = probs[i]
|
prob = probs[i]
|
||||||
for j in range(stcls.c.length):
|
for j in range(state.length):
|
||||||
head = j + stcls.c._sent[j].head
|
head = j + state._sent[j].head
|
||||||
dep = stcls.c._sent[j].dep
|
dep = state._sent[j].dep
|
||||||
heads[j].setdefault(head, 0.0)
|
heads[j].setdefault(head, 0.0)
|
||||||
heads[j][head] += prob
|
heads[j][head] += prob
|
||||||
deps[j].setdefault(dep, 0.0)
|
deps[j].setdefault(dep, 0.0)
|
||||||
|
|
|
@ -123,14 +123,14 @@ cdef class BiluoPushDown(TransitionSystem):
|
||||||
entities = {}
|
entities = {}
|
||||||
probs = beam.probs
|
probs = beam.probs
|
||||||
for i in range(beam.size):
|
for i in range(beam.size):
|
||||||
stcls = <StateClass>beam.at(i)
|
state = <StateC*>beam.at(i)
|
||||||
if stcls.is_final():
|
if state.is_final():
|
||||||
self.finalize_state(stcls.c)
|
self.finalize_state(state)
|
||||||
prob = probs[i]
|
prob = probs[i]
|
||||||
for j in range(stcls.c._e_i):
|
for j in range(state._e_i):
|
||||||
start = stcls.c._ents[j].start
|
start = state._ents[j].start
|
||||||
end = stcls.c._ents[j].end
|
end = state._ents[j].end
|
||||||
label = stcls.c._ents[j].label
|
label = state._ents[j].label
|
||||||
entities.setdefault((start, end, label), 0.0)
|
entities.setdefault((start, end, label), 0.0)
|
||||||
entities[(start, end, label)] += prob
|
entities[(start, end, label)] += prob
|
||||||
return entities
|
return entities
|
||||||
|
@ -139,15 +139,15 @@ cdef class BiluoPushDown(TransitionSystem):
|
||||||
parses = []
|
parses = []
|
||||||
probs = beam.probs
|
probs = beam.probs
|
||||||
for i in range(beam.size):
|
for i in range(beam.size):
|
||||||
stcls = <StateClass>beam.at(i)
|
state = <StateC*>beam.at(i)
|
||||||
if stcls.is_final():
|
if state.is_final():
|
||||||
self.finalize_state(stcls.c)
|
self.finalize_state(state)
|
||||||
prob = probs[i]
|
prob = probs[i]
|
||||||
parse = []
|
parse = []
|
||||||
for j in range(stcls.c._e_i):
|
for j in range(state._e_i):
|
||||||
start = stcls.c._ents[j].start
|
start = state._ents[j].start
|
||||||
end = stcls.c._ents[j].end
|
end = state._ents[j].end
|
||||||
label = stcls.c._ents[j].label
|
label = state._ents[j].label
|
||||||
parse.append((start, end, self.strings[label]))
|
parse.append((start, end, self.strings[label]))
|
||||||
parses.append((prob, parse))
|
parses.append((prob, parse))
|
||||||
return parses
|
return parses
|
||||||
|
|
|
@ -17,7 +17,7 @@ from cpython.ref cimport PyObject, Py_XDECREF
|
||||||
from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno
|
from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno
|
||||||
from libc.math cimport exp
|
from libc.math cimport exp
|
||||||
from libcpp.vector cimport vector
|
from libcpp.vector cimport vector
|
||||||
from libc.string cimport memset
|
from libc.string cimport memset, memcpy
|
||||||
from libc.stdlib cimport calloc, free
|
from libc.stdlib cimport calloc, free
|
||||||
from cymem.cymem cimport Pool
|
from cymem.cymem cimport Pool
|
||||||
from thinc.typedefs cimport weight_t, class_t, hash_t
|
from thinc.typedefs cimport weight_t, class_t, hash_t
|
||||||
|
@ -224,6 +224,16 @@ cdef void cpu_regression_loss(float* d_scores,
|
||||||
d_scores[i] = diff
|
d_scores[i] = diff
|
||||||
|
|
||||||
|
|
||||||
|
def _collect_states(beams):
|
||||||
|
cdef StateClass state
|
||||||
|
cdef Beam beam
|
||||||
|
states = []
|
||||||
|
for beam in beams:
|
||||||
|
state = StateClass.borrow(<StateC*>beam.at(0))
|
||||||
|
states.append(state)
|
||||||
|
return states
|
||||||
|
|
||||||
|
|
||||||
cdef class Parser:
|
cdef class Parser:
|
||||||
"""
|
"""
|
||||||
Base class of the DependencyParser and EntityRecognizer.
|
Base class of the DependencyParser and EntityRecognizer.
|
||||||
|
@ -336,7 +346,7 @@ cdef class Parser:
|
||||||
beam_density=beam_density)
|
beam_density=beam_density)
|
||||||
beam = beams[0]
|
beam = beams[0]
|
||||||
output = self.moves.get_beam_annot(beam)
|
output = self.moves.get_beam_annot(beam)
|
||||||
state = <StateClass>beam.at(0)
|
state = StateClass.borrow(<StateC*>beam.at(0))
|
||||||
self.set_annotations([doc], [state], tensors=tokvecs)
|
self.set_annotations([doc], [state], tensors=tokvecs)
|
||||||
_cleanup(beam)
|
_cleanup(beam)
|
||||||
return output
|
return output
|
||||||
|
@ -356,10 +366,10 @@ cdef class Parser:
|
||||||
if beam_density is None:
|
if beam_density is None:
|
||||||
beam_density = self.cfg.get('beam_density', 0.0)
|
beam_density = self.cfg.get('beam_density', 0.0)
|
||||||
cdef Doc doc
|
cdef Doc doc
|
||||||
cdef Beam beam
|
|
||||||
for batch in cytoolz.partition_all(batch_size, docs):
|
for batch in cytoolz.partition_all(batch_size, docs):
|
||||||
batch = list(batch)
|
batch_in_order = list(batch)
|
||||||
by_length = sorted(list(batch), key=lambda doc: len(doc))
|
by_length = sorted(batch_in_order, key=lambda doc: len(doc))
|
||||||
|
batch_beams = []
|
||||||
for subbatch in cytoolz.partition_all(8, by_length):
|
for subbatch in cytoolz.partition_all(8, by_length):
|
||||||
subbatch = list(subbatch)
|
subbatch = list(subbatch)
|
||||||
if beam_width == 1:
|
if beam_width == 1:
|
||||||
|
@ -369,21 +379,20 @@ cdef class Parser:
|
||||||
beams, tokvecs = self.beam_parse(subbatch,
|
beams, tokvecs = self.beam_parse(subbatch,
|
||||||
beam_width=beam_width,
|
beam_width=beam_width,
|
||||||
beam_density=beam_density)
|
beam_density=beam_density)
|
||||||
parse_states = []
|
parse_states = _collect_states(beams)
|
||||||
for beam in beams:
|
self.set_annotations(subbatch, parse_states, tensors=None)
|
||||||
parse_states.append(<StateClass>beam.at(0))
|
for beam in beams:
|
||||||
self.set_annotations(subbatch, parse_states, tensors=tokvecs)
|
_cleanup(beam)
|
||||||
yield from batch
|
for doc in batch_in_order:
|
||||||
for beam in beams:
|
yield doc
|
||||||
_cleanup(beam)
|
|
||||||
|
|
||||||
def parse_batch(self, docs):
|
def parse_batch(self, docs):
|
||||||
cdef:
|
cdef:
|
||||||
precompute_hiddens state2vec
|
precompute_hiddens state2vec
|
||||||
StateClass stcls
|
|
||||||
Pool mem
|
Pool mem
|
||||||
const float* feat_weights
|
const float* feat_weights
|
||||||
StateC* st
|
StateC* st
|
||||||
|
StateClass stcls
|
||||||
vector[StateC*] states
|
vector[StateC*] states
|
||||||
int guess, nr_class, nr_feat, nr_piece, nr_dim, nr_state, nr_step
|
int guess, nr_class, nr_feat, nr_piece, nr_dim, nr_state, nr_step
|
||||||
int j
|
int j
|
||||||
|
@ -476,50 +485,59 @@ cdef class Parser:
|
||||||
cdef np.ndarray scores
|
cdef np.ndarray scores
|
||||||
cdef Doc doc
|
cdef Doc doc
|
||||||
cdef int nr_class = self.moves.n_moves
|
cdef int nr_class = self.moves.n_moves
|
||||||
cdef StateClass stcls, output
|
|
||||||
cuda_stream = util.get_cuda_stream()
|
cuda_stream = util.get_cuda_stream()
|
||||||
(tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(
|
(tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(
|
||||||
docs, cuda_stream, 0.0)
|
docs, cuda_stream, 0.0)
|
||||||
beams = []
|
|
||||||
cdef int offset = 0
|
cdef int offset = 0
|
||||||
cdef int j = 0
|
cdef int j = 0
|
||||||
cdef int k
|
cdef int k
|
||||||
|
|
||||||
|
beams = []
|
||||||
for doc in docs:
|
for doc in docs:
|
||||||
beam = Beam(nr_class, beam_width, min_density=beam_density)
|
beam = Beam(nr_class, beam_width, min_density=beam_density)
|
||||||
beam.initialize(self.moves.init_beam_state, doc.length, doc.c)
|
beam.initialize(self.moves.init_beam_state, doc.length, doc.c)
|
||||||
for i in range(beam.width):
|
for i in range(beam.width):
|
||||||
stcls = <StateClass>beam.at(i)
|
state = <StateC*>beam.at(i)
|
||||||
stcls.c.offset = offset
|
state.offset = offset
|
||||||
offset += len(doc)
|
offset += len(doc)
|
||||||
beam.check_done(_check_final_state, NULL)
|
beam.check_done(_check_final_state, NULL)
|
||||||
while not beam.is_done:
|
beams.append(beam)
|
||||||
states = []
|
cdef np.ndarray token_ids
|
||||||
|
token_ids = numpy.zeros((len(docs) * beam_width, self.nr_feature),
|
||||||
|
dtype='i', order='C')
|
||||||
|
todo = [beam for beam in beams if not beam.is_done]
|
||||||
|
|
||||||
|
cdef int* c_ids
|
||||||
|
cdef int nr_feature = self.nr_feature
|
||||||
|
cdef int n_states
|
||||||
|
while todo:
|
||||||
|
todo = [beam for beam in beams if not beam.is_done]
|
||||||
|
token_ids.fill(-1)
|
||||||
|
c_ids = <int*>token_ids.data
|
||||||
|
n_states = 0
|
||||||
|
for beam in todo:
|
||||||
for i in range(beam.size):
|
for i in range(beam.size):
|
||||||
stcls = <StateClass>beam.at(i)
|
state = <StateC*>beam.at(i)
|
||||||
# This way we avoid having to score finalized states
|
# This way we avoid having to score finalized states
|
||||||
# We do have to take care to keep indexes aligned, though
|
# We do have to take care to keep indexes aligned, though
|
||||||
if not stcls.is_final():
|
if not state.is_final():
|
||||||
states.append(stcls)
|
state.set_context_tokens(c_ids, nr_feature)
|
||||||
token_ids = self.get_token_ids(states)
|
c_ids += nr_feature
|
||||||
vectors = state2vec(token_ids)
|
n_states += 1
|
||||||
if self.cfg.get('hist_size', 0):
|
if n_states == 0:
|
||||||
hists = numpy.asarray([st.history[:self.cfg['hist_size']]
|
break
|
||||||
for st in states], dtype='i')
|
vectors = state2vec(token_ids[:n_states])
|
||||||
scores = vec2scores((vectors, hists))
|
scores = vec2scores(vectors)
|
||||||
else:
|
c_scores = <float*>scores.data
|
||||||
scores = vec2scores(vectors)
|
for beam in todo:
|
||||||
j = 0
|
|
||||||
c_scores = <float*>scores.data
|
|
||||||
for i in range(beam.size):
|
for i in range(beam.size):
|
||||||
stcls = <StateClass>beam.at(i)
|
state = <StateC*>beam.at(i)
|
||||||
if not stcls.is_final():
|
if not state.is_final():
|
||||||
self.moves.set_valid(beam.is_valid[i], stcls.c)
|
self.moves.set_valid(beam.is_valid[i], state)
|
||||||
for k in range(nr_class):
|
memcpy(beam.scores[i], c_scores, nr_class * sizeof(float))
|
||||||
beam.scores[i][k] = c_scores[j * scores.shape[1] + k]
|
c_scores += nr_class
|
||||||
j += 1
|
beam.advance(_transition_state, NULL, <void*>self.moves.c)
|
||||||
beam.advance(_transition_state, _hash_state, <void*>self.moves.c)
|
|
||||||
beam.check_done(_check_final_state, NULL)
|
beam.check_done(_check_final_state, NULL)
|
||||||
beams.append(beam)
|
|
||||||
tokvecs = self.model[0].ops.unflatten(tokvecs,
|
tokvecs = self.model[0].ops.unflatten(tokvecs,
|
||||||
[len(doc) for doc in docs])
|
[len(doc) for doc in docs])
|
||||||
return beams, tokvecs
|
return beams, tokvecs
|
||||||
|
@ -527,7 +545,7 @@ cdef class Parser:
|
||||||
def update(self, docs, golds, drop=0., sgd=None, losses=None):
|
def update(self, docs, golds, drop=0., sgd=None, losses=None):
|
||||||
if not any(self.moves.has_gold(gold) for gold in golds):
|
if not any(self.moves.has_gold(gold) for gold in golds):
|
||||||
return None
|
return None
|
||||||
if self.cfg.get('beam_width', 1) >= 2 and numpy.random.random() >= 0.5:
|
if self.cfg.get('beam_width', 1) >= 2 and numpy.random.random() >= 0.0:
|
||||||
return self.update_beam(docs, golds,
|
return self.update_beam(docs, golds,
|
||||||
self.cfg['beam_width'], self.cfg['beam_density'],
|
self.cfg['beam_width'], self.cfg['beam_density'],
|
||||||
drop=drop, sgd=sgd, losses=losses)
|
drop=drop, sgd=sgd, losses=losses)
|
||||||
|
@ -965,27 +983,40 @@ cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) no
|
||||||
|
|
||||||
# These are passed as callbacks to thinc.search.Beam
|
# These are passed as callbacks to thinc.search.Beam
|
||||||
cdef int _transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1:
|
cdef int _transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1:
|
||||||
dest = <StateClass>_dest
|
dest = <StateC*>_dest
|
||||||
src = <StateClass>_src
|
src = <StateC*>_src
|
||||||
moves = <const Transition*>_moves
|
moves = <const Transition*>_moves
|
||||||
dest.clone(src)
|
dest.clone(src)
|
||||||
moves[clas].do(dest.c, moves[clas].label)
|
moves[clas].do(dest, moves[clas].label)
|
||||||
dest.c.push_hist(clas)
|
dest.push_hist(clas)
|
||||||
|
|
||||||
|
|
||||||
cdef int _check_final_state(void* _state, void* extra_args) except -1:
|
cdef int _check_final_state(void* _state, void* extra_args) except -1:
|
||||||
return (<StateClass>_state).is_final()
|
state = <StateC*>_state
|
||||||
|
return state.is_final()
|
||||||
|
|
||||||
|
|
||||||
def _cleanup(Beam beam):
|
def _cleanup(Beam beam):
|
||||||
|
cdef StateC* state
|
||||||
|
# Once parsing has finished, states in beam may not be unique. Is this
|
||||||
|
# correct?
|
||||||
|
seen = set()
|
||||||
for i in range(beam.width):
|
for i in range(beam.width):
|
||||||
Py_XDECREF(<PyObject*>beam._states[i].content)
|
addr = <size_t>beam._parents[i].content
|
||||||
Py_XDECREF(<PyObject*>beam._parents[i].content)
|
if addr not in seen:
|
||||||
|
state = <StateC*>addr
|
||||||
|
del state
|
||||||
cdef hash_t _hash_state(void* _state, void* _) except 0:
|
seen.add(addr)
|
||||||
state = <StateClass>_state
|
else:
|
||||||
if state.c.is_final():
|
print(i, addr)
|
||||||
return 1
|
print(seen)
|
||||||
else:
|
raise Exception
|
||||||
return state.c.hash()
|
addr = <size_t>beam._states[i].content
|
||||||
|
if addr not in seen:
|
||||||
|
state = <StateC*>addr
|
||||||
|
del state
|
||||||
|
seen.add(addr)
|
||||||
|
else:
|
||||||
|
print(i, addr)
|
||||||
|
print(seen)
|
||||||
|
raise Exception
|
||||||
|
|
|
@ -13,12 +13,22 @@ from ._state cimport StateC
|
||||||
cdef class StateClass:
|
cdef class StateClass:
|
||||||
cdef Pool mem
|
cdef Pool mem
|
||||||
cdef StateC* c
|
cdef StateC* c
|
||||||
|
cdef int _borrowed
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef inline StateClass init(const TokenC* sent, int length):
|
cdef inline StateClass init(const TokenC* sent, int length):
|
||||||
cdef StateClass self = StateClass()
|
cdef StateClass self = StateClass()
|
||||||
self.c = new StateC(sent, length)
|
self.c = new StateC(sent, length)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
cdef inline StateClass borrow(StateC* ptr):
|
||||||
|
cdef StateClass self = StateClass()
|
||||||
|
del self.c
|
||||||
|
self.c = ptr
|
||||||
|
self._borrowed = 1
|
||||||
|
return self
|
||||||
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef inline StateClass init_offset(const TokenC* sent, int length, int
|
cdef inline StateClass init_offset(const TokenC* sent, int length, int
|
||||||
|
|
|
@ -11,12 +11,14 @@ cdef class StateClass:
|
||||||
def __init__(self, Doc doc=None, int offset=0):
|
def __init__(self, Doc doc=None, int offset=0):
|
||||||
cdef Pool mem = Pool()
|
cdef Pool mem = Pool()
|
||||||
self.mem = mem
|
self.mem = mem
|
||||||
|
self._borrowed = 0
|
||||||
if doc is not None:
|
if doc is not None:
|
||||||
self.c = new StateC(doc.c, doc.length)
|
self.c = new StateC(doc.c, doc.length)
|
||||||
self.c.offset = offset
|
self.c.offset = offset
|
||||||
|
|
||||||
def __dealloc__(self):
|
def __dealloc__(self):
|
||||||
del self.c
|
if self._borrowed != 1:
|
||||||
|
del self.c
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def stack(self):
|
def stack(self):
|
||||||
|
|
|
@ -23,8 +23,7 @@ class OracleError(Exception):
|
||||||
|
|
||||||
|
|
||||||
cdef void* _init_state(Pool mem, int length, void* tokens) except NULL:
|
cdef void* _init_state(Pool mem, int length, void* tokens) except NULL:
|
||||||
cdef StateClass st = StateClass.init(<const TokenC*>tokens, length)
|
cdef StateC* st = new StateC(<const TokenC*>tokens, length)
|
||||||
Py_INCREF(st)
|
|
||||||
return <void*>st
|
return <void*>st
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,5 @@
|
||||||
# cython: embedsignature=True
|
# cython: embedsignature=True
|
||||||
|
# cython: profile=True
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
@ -274,7 +275,7 @@ cdef class Tokenizer:
|
||||||
int has_special, int n) except -1:
|
int has_special, int n) except -1:
|
||||||
cdef int i
|
cdef int i
|
||||||
for i in range(n):
|
for i in range(n):
|
||||||
if tokens[i].lex.id == 0:
|
if self.vocab._by_hash.get(tokens[i].lex.orth) == NULL:
|
||||||
return 0
|
return 0
|
||||||
# See https://github.com/explosion/spaCy/issues/1250
|
# See https://github.com/explosion/spaCy/issues/1250
|
||||||
if has_special:
|
if has_special:
|
||||||
|
|
|
@ -1,4 +1,5 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
|
# cython: profile=True
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
import numpy
|
import numpy
|
||||||
|
@ -154,7 +155,10 @@ cdef class Vocab:
|
||||||
lex = <LexemeC*>mem.alloc(sizeof(LexemeC), 1)
|
lex = <LexemeC*>mem.alloc(sizeof(LexemeC), 1)
|
||||||
lex.orth = self.strings.add(string)
|
lex.orth = self.strings.add(string)
|
||||||
lex.length = len(string)
|
lex.length = len(string)
|
||||||
lex.id = self.length
|
if self.vectors is not None:
|
||||||
|
lex.id = self.vectors.key2row.get(lex.orth, 0)
|
||||||
|
else:
|
||||||
|
lex.id = 0
|
||||||
if self.lex_attr_getters is not None:
|
if self.lex_attr_getters is not None:
|
||||||
for attr, func in self.lex_attr_getters.items():
|
for attr, func in self.lex_attr_getters.items():
|
||||||
value = func(string)
|
value = func(string)
|
||||||
|
@ -164,9 +168,7 @@ cdef class Vocab:
|
||||||
lex.prob = value
|
lex.prob = value
|
||||||
elif value is not None:
|
elif value is not None:
|
||||||
Lexeme.set_struct_attr(lex, attr, value)
|
Lexeme.set_struct_attr(lex, attr, value)
|
||||||
if is_oov:
|
if not is_oov:
|
||||||
lex.id = 0
|
|
||||||
else:
|
|
||||||
key = hash_string(string)
|
key = hash_string(string)
|
||||||
self._add_lex_to_vocab(key, lex)
|
self._add_lex_to_vocab(key, lex)
|
||||||
assert lex != NULL, string
|
assert lex != NULL, string
|
||||||
|
|
|
@ -257,7 +257,7 @@ p
|
||||||
+row
|
+row
|
||||||
+cell #[code dev_data]
|
+cell #[code dev_data]
|
||||||
+cell positional
|
+cell positional
|
||||||
+cell Location of JSON-formatted dev data (optional).
|
+cell Location of JSON-formatted development data for evaluation.
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code --n-iter], #[code -n]
|
+cell #[code --n-iter], #[code -n]
|
||||||
|
|
Loading…
Reference in New Issue
Block a user