From 86ddf692a175aad092cba736f73a7a960e174116 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Tue, 14 Nov 2017 01:37:10 +0100
Subject: [PATCH 01/14] Fix bug in limit calculation on dev data

---
 spacy/gold.pyx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/gold.pyx b/spacy/gold.pyx
index ea4e1817b..d6db9b853 100644
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@@ -179,7 +179,7 @@ class GoldCorpus(object):
             gold_tuples = read_json_file(loc)
             for item in gold_tuples:
                 yield item
-                i += 1
+                i += len(item[1])
                 if self.limit and i >= self.limit:
                     break

From 2512ea9eeba228c59d229029bfcb0775ca622148 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Tue, 14 Nov 2017 02:11:40 +0100
Subject: [PATCH 02/14] Fix memory leak in beam parser

---
 spacy/syntax/_beam_utils.pyx       | 51 +++++++++----------
 spacy/syntax/arc_eager.pyx         | 28 ++++++-----
 spacy/syntax/ner.pyx               | 28 +++++------
 spacy/syntax/nn_parser.pyx         | 78 +++++++++++++++++++-----------
 spacy/syntax/stateclass.pxd        | 10 ++++
 spacy/syntax/stateclass.pyx        |  4 +-
 spacy/syntax/transition_system.pyx |  3 +-
 7 files changed, 118 insertions(+), 84 deletions(-)

diff --git a/spacy/syntax/_beam_utils.pyx b/spacy/syntax/_beam_utils.pyx
index 8a1e5a5fe..b26d13e4d 100644
--- a/spacy/syntax/_beam_utils.pyx
+++ b/spacy/syntax/_beam_utils.pyx
@@ -9,36 +9,31 @@ from thinc.typedefs cimport hash_t, class_t
 from thinc.extra.search cimport MaxViolation

 from .transition_system cimport TransitionSystem, Transition
-from .stateclass cimport StateClass
 from ..gold cimport GoldParse
+from .stateclass cimport StateC, StateClass


 # These are passed as callbacks to thinc.search.Beam
 cdef int _transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1:
-    dest = <StateClass>_dest
-    src = <StateClass>_src
+    dest = <StateC*>_dest
+    src = <StateC*>_src
     moves = <const Transition*>_moves
     dest.clone(src)
-    moves[clas].do(dest.c, moves[clas].label)
-    dest.c.push_hist(clas)
+    moves[clas].do(dest, moves[clas].label)
+    dest.push_hist(clas)


 cdef int _check_final_state(void* _state, void* extra_args) except -1:
-    return (<StateClass>_state).is_final()
-
-
-def _cleanup(Beam beam):
-    for i in range(beam.width):
-        Py_XDECREF(<PyObject*>beam._states[i].content)
-        Py_XDECREF(<PyObject*>beam._parents[i].content)
+    state = <StateC*>_state
+    return state.is_final()


 cdef hash_t _hash_state(void* _state, void* _) except 0:
-    state = <StateClass>_state
-    if state.c.is_final():
+    state = <StateC*>_state
+    if state.is_final():
         return 1
     else:
-        return state.c.hash()
+        return state.hash()


 cdef class ParserBeam(object):
@@ -55,14 +50,15 @@ cdef class ParserBeam(object):
         self.golds = golds
         self.beams = []
         cdef Beam beam
-        cdef StateClass state, st
+        cdef StateClass state
+        cdef StateC* st
         for state in states:
             beam = Beam(self.moves.n_moves, width, density)
             beam.initialize(self.moves.init_beam_state, state.c.length,
                             state.c._sent)
             for i in range(beam.width):
-                st = <StateClass>beam.at(i)
-                st.c.offset = state.c.offset
+                st = <StateC*>beam.at(i)
+                st.offset = state.c.offset
             self.beams.append(beam)
         self.dones = [False] * len(self.beams)
@@ -86,13 +82,14 @@ cdef class ParserBeam(object):
             if self.golds is not None:
                 self._set_costs(beam, self.golds[i], follow_gold=follow_gold)
             if follow_gold:
-                beam.advance(_transition_state, NULL, self.moves.c)
+                beam.advance(_transition_state, _hash_state, self.moves.c)
             else:
                 beam.advance(_transition_state, _hash_state, self.moves.c)
             beam.check_done(_check_final_state, NULL)
+            # This handles the non-monotonic stuff for the parser.
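+            # If a finished beam still contains the gold parse among its
+            # final states, the prediction is already correct, so the beam
+            # can be treated as done and skipped in later updates.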
             if beam.is_done and self.golds is not None:
                 for j in range(beam.size):
-                    state = <StateClass>beam.at(j)
+                    state = StateClass.borrow(<StateC*>beam.at(j))
                     if state.is_final():
                         try:
                             if self.moves.is_gold_parse(state, self.golds[i]):
@@ -107,11 +104,11 @@ cdef class ParserBeam(object):
         cdef int nr_state = min(scores.shape[0], beam.size)
         cdef int nr_class = scores.shape[1]
         for i in range(nr_state):
-            state = <StateClass>beam.at(i)
+            state = <StateC*>beam.at(i)
             if not state.is_final():
                 for j in range(nr_class):
                     beam.scores[i][j] = c_scores[i * nr_class + j]
-                self.moves.set_valid(beam.is_valid[i], state.c)
+                self.moves.set_valid(beam.is_valid[i], state)
             else:
                 for j in range(beam.nr_class):
                     beam.scores[i][j] = 0
@@ -119,8 +116,8 @@ cdef class ParserBeam(object):

     def _set_costs(self, Beam beam, GoldParse gold, int follow_gold=False):
         for i in range(beam.size):
-            state = <StateClass>beam.at(i)
-            if not state.c.is_final():
+            state = StateClass.borrow(<StateC*>beam.at(i))
+            if not state.is_final():
                 self.moves.set_costs(beam.is_valid[i], beam.costs[i],
                                      state, gold)
                 if follow_gold:
@@ -157,7 +154,7 @@ def update_beam(TransitionSystem moves, int nr_feature, int max_steps,
     pbeam = ParserBeam(moves, states, golds,
                        width=width, density=density)
     gbeam = ParserBeam(moves, states, golds,
-                       width=width, density=0.0)
+                       width=width, density=density)
     cdef StateClass state
     beam_maps = []
    backprops = []
@@ -231,7 +228,7 @@ def get_states(pbeams, gbeams, beam_map, nr_update):
         p_indices.append([])
         g_indices.append([])
         for i in range(pbeam.size):
-            state = <StateClass>pbeam.at(i)
+            state = StateClass.borrow(<StateC*>pbeam.at(i))
             if not state.is_final():
                 key = tuple([eg_id] + pbeam.histories[i])
                 assert key not in seen, (key, seen)
@@ -240,7 +237,7 @@ def get_states(pbeams, gbeams, beam_map, nr_update):
                 states.append(state)
         beam_map.update(seen)
         for i in range(gbeam.size):
-            state = <StateClass>gbeam.at(i)
+            state = StateClass.borrow(<StateC*>gbeam.at(i))
             if not state.is_final():
                 key = tuple([eg_id] + gbeam.histories[i])
                 if key in seen:

diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx
index b3c9b5563..16d55db24 100644
--- a/spacy/syntax/arc_eager.pyx
+++ b/spacy/syntax/arc_eager.pyx
@@ -292,12 +292,16 @@ cdef int _get_root(int word, const GoldParseC* gold) nogil:


 cdef void* _init_state(Pool mem, int length, void* tokens) except NULL:
-    cdef StateClass st = StateClass.init(<const TokenC*>tokens, length)
-    for i in range(st.c.length):
-        st.c._sent[i].l_edge = i
-        st.c._sent[i].r_edge = i
+    st = new StateC(<const TokenC*>tokens, length)
+    for i in range(st.length):
+        if st._sent[i].dep == 0:
+            st._sent[i].l_edge = i
+            st._sent[i].r_edge = i
+            st._sent[i].head = 0
+            st._sent[i].dep = 0
+            st._sent[i].l_kids = 0
+            st._sent[i].r_kids = 0
     st.fast_forward()
-    Py_INCREF(st)
     return st
@@ -533,18 +537,18 @@ cdef class ArcEager(TransitionSystem):
         assert n_gold >= 1

     def get_beam_annot(self, Beam beam):
-        length = (<StateClass>beam.at(0)).c.length
+        length = (<StateC*>beam.at(0)).length
         heads = [{} for _ in range(length)]
         deps = [{} for _ in range(length)]
         probs = beam.probs
         for i in range(beam.size):
-            stcls = <StateClass>beam.at(i)
-            self.finalize_state(stcls.c)
-            if stcls.is_final():
+            state = <StateC*>beam.at(i)
+            self.finalize_state(state)
+            if state.is_final():
                 prob = probs[i]
-                for j in range(stcls.c.length):
-                    head = j + stcls.c._sent[j].head
-                    dep = stcls.c._sent[j].dep
+                for j in range(state.length):
+                    head = j + state._sent[j].head
+                    dep = state._sent[j].dep
                     heads[j].setdefault(head, 0.0)
                     heads[j][head] += prob
                     deps[j].setdefault(dep, 0.0)

diff --git a/spacy/syntax/ner.pyx b/spacy/syntax/ner.pyx
index e2e242aea..999760ce0 100644
--- a/spacy/syntax/ner.pyx
+++ b/spacy/syntax/ner.pyx
@@ -123,14 +123,14 @@ cdef class BiluoPushDown(TransitionSystem):
         entities = {}
         probs = beam.probs
         for i in range(beam.size):
-            stcls = <StateClass>beam.at(i)
-            if stcls.is_final():
-                self.finalize_state(stcls.c)
+            state = <StateC*>beam.at(i)
+            if state.is_final():
+                self.finalize_state(state)
                 prob = probs[i]
-                for j in range(stcls.c._e_i):
-                    start = stcls.c._ents[j].start
-                    end = stcls.c._ents[j].end
-                    label = stcls.c._ents[j].label
+                for j in range(state._e_i):
+                    start = state._ents[j].start
+                    end = state._ents[j].end
+                    label = state._ents[j].label
                     entities.setdefault((start, end, label), 0.0)
                     entities[(start, end, label)] += prob
         return entities
@@ -139,15 +139,15 @@ cdef class BiluoPushDown(TransitionSystem):
         parses = []
         probs = beam.probs
         for i in range(beam.size):
-            stcls = <StateClass>beam.at(i)
-            if stcls.is_final():
-                self.finalize_state(stcls.c)
+            state = <StateC*>beam.at(i)
+            if state.is_final():
+                self.finalize_state(state)
                 prob = probs[i]
                 parse = []
-                for j in range(stcls.c._e_i):
-                    start = stcls.c._ents[j].start
-                    end = stcls.c._ents[j].end
-                    label = stcls.c._ents[j].label
+                for j in range(state._e_i):
+                    start = state._ents[j].start
+                    end = state._ents[j].end
+                    label = state._ents[j].label
                     parse.append((start, end, self.strings[label]))
                 parses.append((prob, parse))
         return parses

diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx
index 73da8139d..3ed59d3e8 100644
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -224,6 +224,16 @@ cdef void cpu_regression_loss(float* d_scores,
             d_scores[i] = diff


+def _collect_states(beams):
+    cdef StateClass state
+    cdef Beam beam
+    states = []
+    for beam in beams:
+        state = StateClass.borrow(<StateC*>beam.at(0))
+        states.append(state)
+    return states
+
+
 cdef class Parser:
     """
     Base class of the DependencyParser and EntityRecognizer.
@@ -336,7 +346,7 @@ cdef class Parser:
                                           beam_density=beam_density)
             beam = beams[0]
             output = self.moves.get_beam_annot(beam)
-            state = <StateClass>beam.at(0)
+            state = StateClass.borrow(<StateC*>beam.at(0))
             self.set_annotations([doc], [state], tensors=tokvecs)
             _cleanup(beam)
             return output
@@ -356,10 +366,10 @@ cdef class Parser:
         if beam_density is None:
             beam_density = self.cfg.get('beam_density', 0.0)
         cdef Doc doc
-        cdef Beam beam
         for batch in cytoolz.partition_all(batch_size, docs):
-            batch = list(batch)
-            by_length = sorted(list(batch), key=lambda doc: len(doc))
+            batch_in_order = list(batch)
+            by_length = sorted(batch_in_order, key=lambda doc: len(doc))
+            batch_beams = []
             for subbatch in cytoolz.partition_all(8, by_length):
                 subbatch = list(subbatch)
                 if beam_width == 1:
@@ -369,21 +379,20 @@ cdef class Parser:
                     beams, tokvecs = self.beam_parse(subbatch,
                                                      beam_width=beam_width,
                                                      beam_density=beam_density)
-                parse_states = []
-                for beam in beams:
-                    parse_states.append(<StateClass>beam.at(0))
-                self.set_annotations(subbatch, parse_states, tensors=tokvecs)
-            yield from batch
-            for beam in beams:
-                _cleanup(beam)
+                parse_states = _collect_states(beams)
+                self.set_annotations(subbatch, parse_states, tensors=None)
+                for beam in beams:
+                    _cleanup(beam)
+            for doc in batch_in_order:
+                yield doc

     def parse_batch(self, docs):
         cdef:
             precompute_hiddens state2vec
-            StateClass stcls
             Pool mem
             const float* feat_weights
             StateC* st
+            StateClass stcls
             vector[StateC*] states
             int guess, nr_class, nr_feat, nr_piece, nr_dim, nr_state, nr_step
             int j
@@ -488,14 +497,14 @@ cdef class Parser:
             beam = Beam(nr_class, beam_width, min_density=beam_density)
             beam.initialize(self.moves.init_beam_state, doc.length, doc.c)
             for i in range(beam.width):
-                stcls = <StateClass>beam.at(i)
-                stcls.c.offset = offset
+                state = <StateC*>beam.at(i)
+                state.offset = offset
             offset += len(doc)
             beam.check_done(_check_final_state, NULL)
             while not beam.is_done:
                 states = []
                 for i in range(beam.size):
-                    stcls = <StateClass>beam.at(i)
+                    stcls = StateClass.borrow(<StateC*>beam.at(i))
                     # This way we avoid having to score finalized states
                     # We do have to take care to keep indexes aligned, though
                     if not stcls.is_final():
@@ -511,9 +520,9 @@ cdef class Parser:
                 j = 0
                 c_scores = <float*>scores.data
                 for i in range(beam.size):
-                    stcls = <StateClass>beam.at(i)
-                    if not stcls.is_final():
-                        self.moves.set_valid(beam.is_valid[i], stcls.c)
+                    state = <StateC*>beam.at(i)
+                    if not state.is_final():
+                        self.moves.set_valid(beam.is_valid[i], state)
                         for k in range(nr_class):
                             beam.scores[i][k] = c_scores[j * scores.shape[1] + k]
                         j += 1
@@ -965,27 +974,40 @@ cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) nogil:

 # These are passed as callbacks to thinc.search.Beam
 cdef int _transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1:
-    dest = <StateClass>_dest
-    src = <StateClass>_src
+    dest = <StateC*>_dest
+    src = <StateC*>_src
     moves = <const Transition*>_moves
     dest.clone(src)
-    moves[clas].do(dest.c, moves[clas].label)
-    dest.c.push_hist(clas)
+    moves[clas].do(dest, moves[clas].label)
+    dest.push_hist(clas)


 cdef int _check_final_state(void* _state, void* extra_args) except -1:
-    return (<StateClass>_state).is_final()
+    state = <StateC*>_state
+    return state.is_final()


 def _cleanup(Beam beam):
+    cdef StateC* state
+    # Once parsing has finished, states in beam may not be unique. Is this
+    # correct?
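+    # The same state pointer can be reachable from both the _states and
+    # _parents arrays, so we record each address we free in `seen` and
+    # delete the underlying StateC exactly once.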
+    seen = set()
     for i in range(beam.width):
-        Py_XDECREF(<PyObject*>beam._states[i].content)
-        Py_XDECREF(<PyObject*>beam._parents[i].content)
+        addr = <size_t>beam._parents[i].content
+        if addr not in seen:
+            state = <StateC*>addr
+            del state
+            seen.add(addr)
+        addr = <size_t>beam._states[i].content
+        if addr not in seen:
+            state = <StateC*>addr
+            del state
+            seen.add(addr)


 cdef hash_t _hash_state(void* _state, void* _) except 0:
-    state = <StateClass>_state
-    if state.c.is_final():
+    state = <StateC*>_state
+    if state.is_final():
         return 1
     else:
-        return state.c.hash()
+        return state.hash()

diff --git a/spacy/syntax/stateclass.pxd b/spacy/syntax/stateclass.pxd
index 0ae83ee27..0a9be3b7f 100644
--- a/spacy/syntax/stateclass.pxd
+++ b/spacy/syntax/stateclass.pxd
@@ -13,12 +13,22 @@ from ._state cimport StateC
 cdef class StateClass:
     cdef Pool mem
     cdef StateC* c
+    cdef int _borrowed

     @staticmethod
     cdef inline StateClass init(const TokenC* sent, int length):
         cdef StateClass self = StateClass()
         self.c = new StateC(sent, length)
         return self
+
+    @staticmethod
+    cdef inline StateClass borrow(StateC* ptr):
+        cdef StateClass self = StateClass()
+        del self.c
+        self.c = ptr
+        self._borrowed = 1
+        return self
+

     @staticmethod
     cdef inline StateClass init_offset(const TokenC* sent, int length, int

diff --git a/spacy/syntax/stateclass.pyx b/spacy/syntax/stateclass.pyx
index ea0ec77e5..2a15a2de1 100644
--- a/spacy/syntax/stateclass.pyx
+++ b/spacy/syntax/stateclass.pyx
@@ -11,12 +11,14 @@ cdef class StateClass:
     def __init__(self, Doc doc=None, int offset=0):
         cdef Pool mem = Pool()
         self.mem = mem
+        self._borrowed = 0
         if doc is not None:
             self.c = new StateC(doc.c, doc.length)
             self.c.offset = offset

     def __dealloc__(self):
-        del self.c
+        if self._borrowed != 1:
+            del self.c

     @property
     def stack(self):

diff --git a/spacy/syntax/transition_system.pyx b/spacy/syntax/transition_system.pyx
index c351636c4..94b1ef2b1 100644
--- a/spacy/syntax/transition_system.pyx
+++ b/spacy/syntax/transition_system.pyx
@@ -23,8 +23,7 @@ class OracleError(Exception):


 cdef void* _init_state(Pool mem, int length, void* tokens) except NULL:
-    cdef StateClass st = StateClass.init(<const TokenC*>tokens, length)
-    Py_INCREF(st)
+    cdef StateC* st = new StateC(<const TokenC*>tokens, length)
     return st

From 40c4e8fc0952b0a7905c3cbafe1aeb7c9be47e2c Mon Sep 17 00:00:00 2001
From: ines
Date: Tue, 14 Nov 2017 20:26:05 +0100
Subject: [PATCH 03/14] Remove "optional" from dev_data arg and add more info
 (see #1578)

---
 website/api/cli.jade | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/website/api/cli.jade b/website/api/cli.jade
index b0913e60c..cb98ec279 100644
--- a/website/api/cli.jade
+++ b/website/api/cli.jade
@@ -257,7 +257,7 @@ p
     +row
         +cell #[code dev_data]
         +cell positional
-        +cell Location of JSON-formatted dev data (optional).
+        +cell Location of JSON-formatted development data for evaluation.
     +row
         +cell #[code --n-iter], #[code -n]

From 855872f872196f28fde4a0a2e09e01829b4d26b2 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Tue, 14 Nov 2017 23:36:46 +0100
Subject: [PATCH 04/14] Remove state hashing

---
 spacy/syntax/_beam_utils.pyx |  5 +----
 spacy/syntax/nn_parser.pyx   | 18 +++++++++---------
 2 files changed, 10 insertions(+), 13 deletions(-)

diff --git a/spacy/syntax/_beam_utils.pyx b/spacy/syntax/_beam_utils.pyx
index b26d13e4d..20cbbbaaa 100644
--- a/spacy/syntax/_beam_utils.pyx
+++ b/spacy/syntax/_beam_utils.pyx
@@ -81,10 +81,7 @@ cdef class ParserBeam(object):
             self._set_scores(beam, scores[i])
             if self.golds is not None:
                 self._set_costs(beam, self.golds[i], follow_gold=follow_gold)
-            if follow_gold:
-                beam.advance(_transition_state, _hash_state, self.moves.c)
-            else:
-                beam.advance(_transition_state, _hash_state, self.moves.c)
+            beam.advance(_transition_state, NULL, self.moves.c)
             beam.check_done(_check_final_state, NULL)
             # This handles the non-monotonic stuff for the parser.

diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx
index 3ed59d3e8..49ffe2062 100644
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -526,7 +526,7 @@ cdef class Parser:
                         for k in range(nr_class):
                             beam.scores[i][k] = c_scores[j * scores.shape[1] + k]
                         j += 1
-                beam.advance(_transition_state, _hash_state, self.moves.c)
+                beam.advance(_transition_state, NULL, self.moves.c)
                 beam.check_done(_check_final_state, NULL)
             beams.append(beam)
         tokvecs = self.model[0].ops.unflatten(tokvecs,
@@ -998,16 +998,16 @@ def _cleanup(Beam beam):
             state = <StateC*>addr
             del state
             seen.add(addr)
+        else:
+            print(i, addr)
+            print(seen)
+            raise Exception
         addr = <size_t>beam._states[i].content
         if addr not in seen:
             state = <StateC*>addr
             del state
             seen.add(addr)
-
-
-cdef hash_t _hash_state(void* _state, void* _) except 0:
-    state = <StateC*>_state
-    if state.is_final():
-        return 1
-    else:
-        return state.hash()
+        else:
+            print(i, addr)
+            print(seen)
+            raise Exception

From d274d3a3b9f4625ab0854b7896e2134d3be0f4bc Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Wed, 15 Nov 2017 00:51:42 +0100
Subject: [PATCH 05/14] Let beam forward use minibatches

---
 spacy/syntax/nn_parser.pyx | 55 ++++++++++++++++++++++----------------
 1 file changed, 32 insertions(+), 23 deletions(-)

diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx
index 49ffe2062..bb01cecf1 100644
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -17,7 +17,7 @@ from cpython.ref cimport PyObject, Py_XDECREF
 from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno
 from libc.math cimport exp
 from libcpp.vector cimport vector
-from libc.string cimport memset
+from libc.string cimport memset, memcpy
 from libc.stdlib cimport calloc, free
 from cymem.cymem cimport Pool
 from thinc.typedefs cimport weight_t, class_t, hash_t
@@ -485,14 +485,14 @@ cdef class Parser:
         cdef np.ndarray scores
         cdef Doc doc
         cdef int nr_class = self.moves.n_moves
         cuda_stream = util.get_cuda_stream()
         (tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(
             docs, cuda_stream, 0.0)
-        beams = []
         cdef int offset = 0
         cdef int j = 0
         cdef int k
+
+        beams = []
         for doc in docs:
             beam = Beam(nr_class, beam_width, min_density=beam_density)
             beam.initialize(self.moves.init_beam_state, doc.length, doc.c)
             for i in range(beam.width):
                 state = <StateC*>beam.at(i)
                 state.offset = offset
             offset += len(doc)
             beam.check_done(_check_final_state, NULL)
-            while not beam.is_done:
-                states = []
+            beams.append(beam)
+        cdef np.ndarray token_ids
+        token_ids = numpy.zeros((len(docs) * beam_width, self.nr_feature),
+                                dtype='i', order='C')
+        todo = [beam for beam in beams if not beam.is_done]
+
+        cdef int* c_ids
+        cdef int nr_feature = self.nr_feature
+        cdef int n_states
+        while todo:
+            todo = [beam for beam in beams if not beam.is_done]
+            token_ids.fill(-1)
+            c_ids = <int*>token_ids.data
+            n_states = 0
+            for beam in todo:
                 for i in range(beam.size):
-                    stcls = StateClass.borrow(<StateC*>beam.at(i))
+                    state = <StateC*>beam.at(i)
                     # This way we avoid having to score finalized states
                     # We do have to take care to keep indexes aligned, though
-                    if not stcls.is_final():
-                        states.append(stcls)
-                token_ids = self.get_token_ids(states)
-                vectors = state2vec(token_ids)
-                if self.cfg.get('hist_size', 0):
-                    hists = numpy.asarray([st.history[:self.cfg['hist_size']]
-                                           for st in states], dtype='i')
-                    scores = vec2scores((vectors, hists))
-                else:
-                    scores = vec2scores(vectors)
-                j = 0
-                c_scores = <float*>scores.data
+                    if not state.is_final():
+                        state.set_context_tokens(c_ids, nr_feature)
+                        c_ids += nr_feature
+                        n_states += 1
+            if n_states == 0:
+                break
+            vectors = state2vec(token_ids[:n_states])
+            scores = vec2scores(vectors)
+            c_scores = <float*>scores.data
+            for beam in todo:
                 for i in range(beam.size):
                     state = <StateC*>beam.at(i)
                     if not state.is_final():
                         self.moves.set_valid(beam.is_valid[i], state)
-                        for k in range(nr_class):
-                            beam.scores[i][k] = c_scores[j * scores.shape[1] + k]
-                        j += 1
+                        memcpy(beam.scores[i], c_scores, nr_class * sizeof(float))
+                        c_scores += nr_class
                 beam.advance(_transition_state, NULL, self.moves.c)
                 beam.check_done(_check_final_state, NULL)
         tokvecs = self.model[0].ops.unflatten(tokvecs,
                                               [len(doc) for doc in docs])
         return beams, tokvecs

     def update(self, docs, golds, drop=0., sgd=None, losses=None):
         if not any(self.moves.has_gold(gold) for gold in golds):
             return None
-        if self.cfg.get('beam_width', 1) >= 2 and numpy.random.random() >= 0.5:
+        if self.cfg.get('beam_width', 1) >= 2 and numpy.random.random() >= 0.0:
             return self.update_beam(docs, golds,
                 self.cfg['beam_width'], self.cfg['beam_density'],
                 drop=drop, sgd=sgd, losses=losses)

From 334ed433b2cc392e8bb16cd779dafb1c41ca96d8 Mon Sep 17 00:00:00 2001
From: yogendrasoni
Date: Wed, 15 Nov 2017 13:55:08 +0530
Subject: [PATCH 06/14] rstrip line before rsplit

Loading the English fastText vectors gave an error, because each line ends
with a newline and rsplit was splitting it incorrectly.
---
 examples/vectors_fast_text.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/vectors_fast_text.py b/examples/vectors_fast_text.py
index 1544e1d5e..4e5640f0d 100644
--- a/examples/vectors_fast_text.py
+++ b/examples/vectors_fast_text.py
@@ -29,7 +29,7 @@ def main(vectors_loc, lang=None):
         nr_row, nr_dim = header.split()
         nlp.vocab.reset_vectors(width=int(nr_dim))
         for line in file_:
-            line = line.decode('utf8')
+            line = line.rstrip().decode('utf8')
             pieces = line.rsplit(' ', nr_dim)
             word = pieces[0]
             vector = numpy.asarray([float(v) for v in pieces[1:]], dtype='f')

From c9d72de0fb9814e39971ce79d32d62b21c425154 Mon Sep 17 00:00:00 2001
From: ines
Date: Wed, 15 Nov 2017 12:44:02 +0100
Subject: [PATCH 07/14] Add dummy serialization methods for Japanese and
 missing lang getter (resolves #1557)

---
 spacy/lang/ja/__init__.py | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py
index 04cc013a4..3b67c5489 100644
--- a/spacy/lang/ja/__init__.py
+++ b/spacy/lang/ja/__init__.py
@@ -21,8 +21,25 @@ class JapaneseTokenizer(object):
         words = [x.surface for x in self.tokenizer.tokenize(text)]
         return Doc(self.vocab, words=words, spaces=[False]*len(words))

+    # add dummy methods for to_bytes, from_bytes, to_disk and from_disk to
+    # allow serialization (see #1557)
+    def to_bytes(self, **exclude):
+        return b''
+
+    def from_bytes(self, bytes_data, **exclude):
+        return self
+
+    def to_disk(self, path, **exclude):
+        return None
+
+    def from_disk(self, path, **exclude):
+        return self
+

 class JapaneseDefaults(Language.Defaults):
+    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+    lex_attr_getters[LANG] = lambda text: 'ja'
+
     @classmethod
     def create_tokenizer(cls, nlp=None):
         return JapaneseTokenizer(cls, nlp)

From 8d692771f68f0b510c5c4f4f58ba9ddb3c677c81 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Wed, 15 Nov 2017 13:51:25 +0100
Subject: [PATCH 08/14] Improve profiling

---
 spacy/cli/profile.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/spacy/cli/profile.py b/spacy/cli/profile.py
index a394989d0..5c0ed521b 100644
--- a/spacy/cli/profile.py
+++ b/spacy/cli/profile.py
@@ -11,6 +11,7 @@ import spacy
 import sys
 import tqdm
 import cytoolz
+import thinc.extra.datasets


 def read_inputs(loc):
@@ -32,14 +33,18 @@ def profile(cmd, lang, inputs=None):
     """
     Profile a spaCy pipeline, to find out which functions take the most time.
     """
+    if inputs is None:
+        imdb_train, _ = thinc.extra.datasets.imdb()
+        inputs, _ = zip(*imdb_train)
+        inputs = inputs[:2000]
     nlp = spacy.load(lang)
     texts = list(cytoolz.take(10000, inputs))
     cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(),
                     "Profile.prof")
     s = pstats.Stats("Profile.prof")
-    s.strip_dirs().sort_stats("time").print_stats()
+    s.strip_dirs().sort_stats("cumtime").print_stats()


 def parse_texts(nlp, texts):
-    for doc in nlp.pipe(tqdm.tqdm(texts), batch_size=128):
+    for doc in nlp.pipe(tqdm.tqdm(texts), batch_size=16):
         pass

From fe3c42a06b89dd819c2329af6a3bbc46b51a869f Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Wed, 15 Nov 2017 13:55:46 +0100
Subject: [PATCH 09/14] Fix caching in tokenizer

---
 spacy/tokenizer.pyx | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index 67ff47743..095fbf4ad 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -1,4 +1,5 @@
 # cython: embedsignature=True
+# cython: profile=True
 # coding: utf8
 from __future__ import unicode_literals

@@ -268,7 +269,7 @@ cdef class Tokenizer:
                          int has_special, int n) except -1:
         cdef int i
         for i in range(n):
-            if tokens[i].lex.id == 0:
+            if self.vocab._by_hash.get(tokens[i].lex.orth) == NULL:
                 return 0
         # See https://github.com/explosion/spaCy/issues/1250
         if has_special:

From 2f169fdb0a62e087e4d14f867b2ddd85b40a3983 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Wed, 15 Nov 2017 13:58:03 +0100
Subject: [PATCH 10/14] Set lex ID correctly for new tokens in Vocab

---
 spacy/vocab.pyx | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index 675e4a805..cc344ef73 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -1,4 +1,5 @@
 # coding: utf8
+# cython: profile=True
 from __future__ import unicode_literals

 import numpy
@@ -154,7 +155,7 @@ cdef class Vocab:
         lex = <LexemeC*>mem.alloc(sizeof(LexemeC), 1)
         lex.orth = self.strings.add(string)
         lex.length = len(string)
-        lex.id = self.length
+        lex.id = self.vectors.key2row.get(lex.orth, 0)
         if self.lex_attr_getters is not None:
             for attr, func in self.lex_attr_getters.items():
                 value = func(string)
@@ -164,9 +165,7 @@ cdef class Vocab:
                     lex.prob = value
                 elif value is not None:
                     Lexeme.set_struct_attr(lex, attr, value)
-        if is_oov:
-            lex.id = 0
-        else:
+        if not is_oov:
             key = hash_string(string)
             self._add_lex_to_vocab(key, lex)
         assert lex != NULL, string

From 8e6524788689e1dfb3303a7a7b38b65a051bf5d7 Mon Sep 17 00:00:00 2001
From: ines
Date: Wed, 15 Nov 2017 14:23:58 +0100
Subject: [PATCH 11/14] Fix lex.id if vectors is None

---
 spacy/vocab.pyx | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index cc344ef73..59df8cc1b 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -155,7 +155,10 @@ cdef class Vocab:
         lex = <LexemeC*>mem.alloc(sizeof(LexemeC), 1)
         lex.orth = self.strings.add(string)
         lex.length = len(string)
-        lex.id = self.vectors.key2row.get(lex.orth, 0)
+        if self.vectors is not None:
+            lex.id = self.vectors.key2row.get(lex.orth, 0)
+        else:
+            lex.id = 0
         if self.lex_attr_getters is not None:
             for attr, func in self.lex_attr_getters.items():
                 value = func(string)

From 716ccbb71e0124155bd2007443a01cab3adbb9b9 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Wed, 15 Nov 2017 14:59:34 +0100
Subject: [PATCH 12/14] Require thinc 6.10.1

---
 requirements.txt | 2 +-
 setup.py         | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 01e41c993..884d79829 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,7 +3,7 @@ pathlib
 numpy>=1.7
 cymem>=1.30,<1.32
 preshed>=1.0.0,<2.0.0
-thinc>=6.10.0,<6.11.0
+thinc>=6.10.1,<6.11.0
 murmurhash>=0.28,<0.29
 plac<1.0.0,>=0.9.6
 six

diff --git a/setup.py b/setup.py
index 727df5e4e..11acae95c 100755
--- a/setup.py
+++ b/setup.py
@@ -190,7 +190,7 @@ def setup_package():
             'murmurhash>=0.28,<0.29',
             'cymem>=1.30,<1.32',
             'preshed>=1.0.0,<2.0.0',
-            'thinc>=6.10.0,<6.11.0',
+            'thinc>=6.10.1,<6.11.0',
             'plac<1.0.0,>=0.9.6',
             'six',
             'pathlib',

From cf0be620967b16e3ad910a27c254587f0f319f63 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Wed, 15 Nov 2017 15:00:18 +0100
Subject: [PATCH 13/14] Increment version

---
 spacy/about.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/spacy/about.py b/spacy/about.py
index fdaf3ccff..da35cd8ef 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -3,13 +3,13 @@
 # https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py

 __title__ = 'spacy'
-__version__ = '2.0.2'
+__version__ = '2.0.3.dev0'
 __summary__ = 'Industrial-strength Natural Language Processing (NLP) with Python and Cython'
 __uri__ = 'https://spacy.io'
 __author__ = 'Explosion AI'
 __email__ = 'contact@explosion.ai'
 __license__ = 'MIT'
-__release__ = True
+__release__ = False

 __docs_models__ = 'https://spacy.io/usage/models'
 __download_url__ = 'https://github.com/explosion/spacy-models/releases/download'

From b60d92aca8ec40f2380319ab43119c15735bd784 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Wed, 15 Nov 2017 16:14:46 +0100
Subject: [PATCH 14/14] Increment version

---
 spacy/about.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/spacy/about.py b/spacy/about.py
index da35cd8ef..8c63531a4 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -3,13 +3,13 @@
 # https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py

 __title__ = 'spacy'
-__version__ = '2.0.3.dev0'
+__version__ = '2.0.3'
 __summary__ = 'Industrial-strength Natural Language Processing (NLP) with Python and Cython'
 __uri__ = 'https://spacy.io'
 __author__ = 'Explosion AI'
 __email__ = 'contact@explosion.ai'
 __license__ = 'MIT'
-__release__ = False
+__release__ = True

 __docs_models__ = 'https://spacy.io/usage/models'
 __download_url__ = 'https://github.com/explosion/spacy-models/releases/download'
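
The fix in patch 06 is easiest to see with the whitespace written out: fastText's
.vec text format stores one "word v1 v2 ... vn" line per entry, and a trailing
space or newline can make rsplit() return a whitespace-only final piece, shifting
the word/vector boundary so that float() fails. A minimal standalone sketch of
the fixed parsing logic, for illustration only (the load_vectors function name
and the dict return value are assumptions, not part of the patched example
script):

    import numpy

    def load_vectors(vectors_loc):
        # Hypothetical re-implementation of the loop patched in
        # examples/vectors_fast_text.py.
        vectors = {}
        with open(vectors_loc, 'rb') as file_:
            header = file_.readline()
            nr_row, nr_dim = header.split()
            for line in file_:
                # As in patch 06: strip trailing whitespace before splitting.
                # Without this, rsplit() can yield a whitespace-only last
                # piece, and float() then raises ValueError on it.
                line = line.rstrip().decode('utf8')
                pieces = line.rsplit(' ', int(nr_dim))
                word = pieces[0]
                vectors[word] = numpy.asarray(
                    [float(v) for v in pieces[1:]], dtype='f')
        return vectors

For example, 'the 0.1 0.2 \n'.rsplit(' ', 2) gives ['the 0.1', '0.2', '\n'],
whose last piece breaks float(), while rstripping first gives the intended
['the', '0.1', '0.2'].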