From bdaac7ab445c247e8137950ee66d698806a4830c Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 23 May 2017 02:59:31 -0500 Subject: [PATCH] WIP on improving parser efficiency --- spacy/cli/train.py | 17 ++- spacy/gold.pyx | 19 ++-- spacy/language.py | 2 +- spacy/matcher.pyx | 3 +- spacy/pipeline.pyx | 20 +--- spacy/syntax/nn_parser.pxd | 4 +- spacy/syntax/nn_parser.pyx | 139 +++++++++++++++--------- spacy/tests/regression/test_issue429.py | 5 +- spacy/tests/test_matcher.py | 4 +- 9 files changed, 119 insertions(+), 94 deletions(-) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 2945794e7..07e97fe1e 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -9,6 +9,7 @@ from pathlib import Path import dill import tqdm from thinc.neural.optimizers import linear_decay +from timeit import default_timer as timer from ..tokens.doc import Doc from ..scorer import Scorer @@ -81,8 +82,13 @@ def train(_, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0, batch_size = min(batch_size, max_batch_size) dropout = linear_decay(orig_dropout, dropout_decay, i*n_train_docs+idx) with nlp.use_params(optimizer.averages): + start = timer() scorer = nlp.evaluate(corpus.dev_docs(nlp)) - print_progress(i, {}, scorer.scores) + end = timer() + n_words = scorer.tokens.tp + scorer.tokens.fn + assert n_words != 0 + wps = n_words / (end-start) + print_progress(i, {}, scorer.scores, wps=wps) with (output_path / 'model.bin').open('wb') as file_: with nlp.use_params(optimizer.averages): dill.dump(nlp, file_, -1) @@ -98,14 +104,14 @@ def _render_parses(i, to_render): file_.write(html) -def print_progress(itn, losses, dev_scores): - # TODO: Fix! 
+def print_progress(itn, losses, dev_scores, wps=0.0): scores = {} for col in ['dep_loss', 'tag_loss', 'uas', 'tags_acc', 'token_acc', - 'ents_p', 'ents_r', 'ents_f']: + 'ents_p', 'ents_r', 'ents_f', 'wps']: scores[col] = 0.0 scores.update(losses) scores.update(dev_scores) + scores['wps'] = wps tpl = '\t'.join(( '{:d}', '{dep_loss:.3f}', @@ -115,7 +121,8 @@ def print_progress(itn, losses, dev_scores): '{ents_r:.3f}', '{ents_f:.3f}', '{tags_acc:.3f}', - '{token_acc:.3f}')) + '{token_acc:.3f}', + '{wps:.1f}')) print(tpl.format(itn, **scores)) diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 651cefe2f..53bd25890 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -144,7 +144,7 @@ def _min_edit_path(cand_words, gold_words): class GoldCorpus(object): """An annotated corpus, using the JSON file format. Manages annotations for tagging, dependency parsing and NER.""" - def __init__(self, train_path, dev_path, limit=None): + def __init__(self, train_path, dev_path, gold_preproc=True, limit=None): """Create a GoldCorpus. train_path (unicode or Path): File or directory of training data. 
@@ -184,7 +184,7 @@ class GoldCorpus(object): n += 1 return n - def train_docs(self, nlp, shuffle=0, gold_preproc=True, + def train_docs(self, nlp, shuffle=0, gold_preproc=False, projectivize=False): train_tuples = self.train_tuples if projectivize: @@ -195,7 +195,7 @@ class GoldCorpus(object): gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc) yield from gold_docs - def dev_docs(self, nlp, gold_preproc=True): + def dev_docs(self, nlp, gold_preproc=False): gold_docs = self.iter_gold_docs(nlp, self.dev_tuples, gold_preproc) gold_docs = nlp.preprocess_gold(gold_docs) yield from gold_docs @@ -203,6 +203,11 @@ class GoldCorpus(object): @classmethod def iter_gold_docs(cls, nlp, tuples, gold_preproc): for raw_text, paragraph_tuples in tuples: + if gold_preproc: + raw_text = None + else: + paragraph_tuples = merge_sents(paragraph_tuples) + docs = cls._make_docs(nlp, raw_text, paragraph_tuples, gold_preproc) golds = cls._make_golds(docs, paragraph_tuples) @@ -211,15 +216,11 @@ class GoldCorpus(object): @classmethod def _make_docs(cls, nlp, raw_text, paragraph_tuples, gold_preproc): - if gold_preproc: - return [Doc(nlp.vocab, words=sent_tuples[0][1]) - for sent_tuples in paragraph_tuples] - elif raw_text is not None: + if raw_text is not None: return [nlp.make_doc(raw_text)] else: - docs = [Doc(nlp.vocab, words=sent_tuples[0][1]) + return [Doc(nlp.vocab, words=sent_tuples[0][1]) for sent_tuples in paragraph_tuples] - return merge_sents(docs) @classmethod def _make_golds(cls, docs, paragraph_tuples): diff --git a/spacy/language.py b/spacy/language.py index 37f7ae207..cc4c29867 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -334,7 +334,7 @@ class Language(object): >>> for doc in nlp.pipe(texts, batch_size=50, n_threads=4): >>> assert doc.is_parsed """ - #docs = (self.make_doc(text) for text in texts) + docs = (self.make_doc(text) for text in texts) docs = texts for proc in self.pipeline: name = getattr(proc, 'name', None) diff --git 
a/spacy/matcher.pyx b/spacy/matcher.pyx index 24bb7b65e..20e2a8993 100644 --- a/spacy/matcher.pyx +++ b/spacy/matcher.pyx @@ -215,7 +215,7 @@ cdef class Matcher: """ return len(self._patterns) - def add(self, key, on_match, *patterns): + def add(self, key, *patterns, **kwargs): """Add a match-rule to the matcher. A match-rule consists of: an ID key, an on_match callback, and one or more patterns. If the key exists, the patterns are appended to the @@ -227,6 +227,7 @@ cdef class Matcher: descriptors can also include quantifiers. There are currently important known problems with the quantifiers – see the docs. """ + on_match = kwargs.get('on_match', None) for pattern in patterns: if len(pattern) == 0: msg = ("Cannot add pattern for zero tokens to matcher.\n" diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx index cb68846af..af71b1ad6 100644 --- a/spacy/pipeline.pyx +++ b/spacy/pipeline.pyx @@ -167,7 +167,7 @@ class NeuralTagger(object): self.model = model def __call__(self, doc): - tags = self.predict(doc.tensor) + tags = self.predict([doc.tensor]) self.set_annotations([doc], tags) def pipe(self, stream, batch_size=128, n_threads=-1): @@ -340,24 +340,6 @@ cdef class NeuralEntityRecognizer(NeuralParser): nr_feature = 6 - def get_token_ids(self, states): - cdef StateClass state - cdef int n_tokens = 6 - ids = numpy.zeros((len(states), n_tokens), dtype='i', order='c') - for i, state in enumerate(states): - ids[i, 0] = state.c.B(0)-1 - ids[i, 1] = state.c.B(0) - ids[i, 2] = state.c.B(1) - ids[i, 3] = state.c.E(0) - ids[i, 4] = state.c.E(0)-1 - ids[i, 5] = state.c.E(0)+1 - for j in range(6): - if ids[i, j] >= state.c.length: - ids[i, j] = -1 - if ids[i, j] >= 0: - ids[i, j] += state.c.offset - return ids - cdef class BeamDependencyParser(BeamParser): TransitionSystem = ArcEager diff --git a/spacy/syntax/nn_parser.pxd b/spacy/syntax/nn_parser.pxd index 8692185e5..f6963ea18 100644 --- a/spacy/syntax/nn_parser.pxd +++ b/spacy/syntax/nn_parser.pxd @@ -15,7 +15,7 @@ cdef 
class Parser: cdef readonly object cfg cdef void _parse_step(self, StateC* state, - const float* feat_weights, - int nr_class, int nr_feat) nogil + int* token_ids, float* scores, int* is_valid, + const float* feat_weights, int nr_class, int nr_feat) nogil #cdef int parseC(self, TokenC* tokens, int length, int nr_feat) nogil diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 995ff5278..1b96bae36 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -19,6 +19,7 @@ import numpy.random cimport numpy as np from libcpp.vector cimport vector +from libcpp.pair cimport pair from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF from cpython.exc cimport PyErr_CheckSignals from libc.stdint cimport uint32_t, uint64_t @@ -68,6 +69,9 @@ def set_debug(val): DEBUG = val +ctypedef pair[int, StateC*] step_t + + cdef class precompute_hiddens: '''Allow a model to be "primed" by pre-computing input features in bulk. @@ -119,6 +123,9 @@ cdef class precompute_hiddens: self._is_synchronized = True return self._cached.data + def get_bp_hiddens(self): + return self._bp_hiddens + def __call__(self, X): return self.begin_update(X)[0] @@ -308,7 +315,6 @@ cdef class Parser: cdef: precompute_hiddens state2vec StateClass state - Pool mem const float* feat_weights StateC* st vector[StateC*] next_step, this_step @@ -336,7 +342,14 @@ cdef class Parser: cdef int i while not next_step.empty(): for i in cython.parallel.prange(next_step.size(), num_threads=4, nogil=True): - self._parse_step(next_step[i], feat_weights, nr_class, nr_feat) + token_ids = calloc(nr_feat, sizeof(int)) + scores = calloc(nr_class, sizeof(float)) + is_valid = calloc(nr_class, sizeof(int)) + self._parse_step(next_step[i], token_ids, scores, is_valid, + feat_weights, nr_class, nr_feat) + free(is_valid) + free(scores) + free(token_ids) this_step, next_step = next_step, this_step next_step.clear() for st in this_step: @@ -345,12 +358,8 @@ cdef class Parser: return states cdef void 
_parse_step(self, StateC* state, - const float* feat_weights, - int nr_class, int nr_feat) nogil: - token_ids = calloc(nr_feat, sizeof(int)) - scores = calloc(nr_class, sizeof(float)) - is_valid = calloc(nr_class, sizeof(int)) - + int* token_ids, float* scores, int* is_valid, + const float* feat_weights, int nr_class, int nr_feat) nogil: state.set_context_tokens(token_ids, nr_feat) sum_state_features(scores, feat_weights, token_ids, 1, nr_feat, nr_class) @@ -359,66 +368,91 @@ cdef class Parser: action = self.moves.c[guess] action.do(state, action.label) - free(is_valid) - free(scores) - free(token_ids) - def update(self, docs_tokvecs, golds, drop=0., sgd=None): + cdef: + precompute_hiddens state2vec + StateClass state + const float* feat_weights + StateC* st + vector[step_t] next_step, this_step + cdef int[:, ::1] is_valid, token_ids + cdef float[:, ::1] scores, d_scores, costs + int nr_state, nr_feat, nr_class + docs, tokvec_lists = docs_tokvecs - tokvecs = self.model[0].ops.flatten(tokvec_lists) if isinstance(docs, Doc) and isinstance(golds, GoldParse): docs = [docs] golds = [golds] + assert len(docs) == len(golds) == len(tokvec_lists) + nr_state = len(docs) + nr_feat = self.nr_feature + nr_class = self.moves.n_moves + + token_ids = numpy.zeros((nr_state, nr_feat), dtype='i') + is_valid = numpy.zeros((nr_state, nr_class), dtype='i') + scores = numpy.zeros((nr_state, nr_class), dtype='f') + d_scores = numpy.zeros((nr_state, nr_class), dtype='f') + costs = numpy.zeros((nr_state, nr_class), dtype='f') + + tokvecs = self.model[0].ops.flatten(tokvec_lists) cuda_stream = get_cuda_stream() + state2vec, vec2scores = self.get_batch_model(nr_state, tokvecs, + cuda_stream, drop) + golds = [self.moves.preprocess_gold(g) for g in golds] states = self.moves.init_batch(docs) - state2vec, vec2scores = self.get_batch_model(len(states), tokvecs, cuda_stream, - drop) - - todo = [(s, g) for (s, g) in zip(states, golds) - if not s.is_final() and g is not None] + cdef step_t step + 
cdef int i + for i, state in enumerate(states): + if not state.c.is_final(): + step.first = i + step.second = state.c + next_step.push_back(step) + self.moves.set_costs(&is_valid[i, 0], &costs[i, 0], state, golds[i]) + feat_weights = state2vec.get_feat_weights() + bp_hiddens = state2vec.get_bp_hiddens() + d_tokvecs = self.model[0].ops.allocate(tokvecs.shape) backprops = [] - cdef float loss = 0. - while len(todo) >= 3: - states, golds = zip(*todo) - token_ids = self.get_token_ids(states) - vector, bp_vector = state2vec.begin_update(token_ids, drop=drop) - scores, bp_scores = vec2scores.begin_update(vector, drop=drop) + while next_step.size(): + # Allocate these each step, so the copy can be async + np_token_ids = numpy.zeros((nr_state, nr_feat), dtype='i') + np_d_scores = numpy.zeros((nr_state, nr_class), dtype='f') + token_ids = np_token_ids + d_scores = np_d_scores + for step in next_step: + i = step.first + st = step.second + self._parse_step(st, &token_ids[i, 0], + &scores[i, 0], &is_valid[i, 0], + feat_weights, nr_class, nr_feat) + cpu_log_loss(&d_scores[i, 0], + &costs[i, 0], &is_valid[i, 0], &scores[i, 0], nr_class) + backprops.append(( + get_async(cuda_stream, np_token_ids), + get_async(cuda_stream, np_d_scores))) + this_step, next_step = next_step, this_step + next_step.clear() + for step in this_step: + i = step.first + st = step.second + if not st.is_final(): + next_step.push_back(step) + self.moves.set_costs(&is_valid[i, 0], &costs[i, 0], + states[i], golds[i]) + cuda_stream.synchronize() + for gpu_token_ids, gpu_d_scores in backprops: + d_features = bp_hiddens((gpu_d_scores, gpu_token_ids), sgd) + d_features *= (gpu_token_ids >= 0).reshape((nr_state, nr_feat, 1)) - d_scores = self.get_batch_loss(states, golds, scores) - d_vector = bp_scores(d_scores, sgd=sgd) - - if isinstance(self.model[0].ops, CupyOps) \ - and not isinstance(token_ids, state2vec.ops.xp.ndarray): - # Move token_ids and d_vector to CPU, asynchronously - backprops.append(( - 
get_async(cuda_stream, token_ids), - get_async(cuda_stream, d_vector), - bp_vector - )) - else: - backprops.append((token_ids, d_vector, bp_vector)) - self.transition_batch(states, scores) - todo = [st for st in todo if not st[0].is_final()] - # Tells CUDA to block, so our async copies complete. - if cuda_stream is not None: - cuda_stream.synchronize() - d_tokvecs = state2vec.ops.allocate(tokvecs.shape) - xp = state2vec.ops.xp # Handle for numpy/cupy - for token_ids, d_vector, bp_vector in backprops: - d_state_features = bp_vector(d_vector, sgd=sgd) - active_feats = token_ids * (token_ids >= 0) - active_feats = active_feats.reshape((token_ids.shape[0], token_ids.shape[1], 1)) + xp = self.model[0].ops.xp if hasattr(xp, 'scatter_add'): - xp.scatter_add(d_tokvecs, - token_ids, d_state_features * active_feats) + xp.scatter_add(d_tokvecs, gpu_token_ids, d_features) else: - xp.add.at(d_tokvecs, - token_ids, d_state_features * active_feats) + xp.add.at(d_tokvecs, gpu_token_ids, d_features) return self.model[0].ops.unflatten(d_tokvecs, [len(d) for d in docs]) def get_batch_model(self, batch_size, tokvecs, stream, dropout): diff --git a/spacy/tests/regression/test_issue429.py b/spacy/tests/regression/test_issue429.py index 2782a0fb2..c5dc6989b 100644 --- a/spacy/tests/regression/test_issue429.py +++ b/spacy/tests/regression/test_issue429.py @@ -17,8 +17,9 @@ def test_issue429(EN): doc = EN('a') matcher = Matcher(EN.vocab) - matcher.add('TEST', on_match=merge_phrases, [{'ORTH': 'a'}]) - doc = EN.tokenizer('a b c') + matcher.add('TEST', [{'ORTH': 'a'}], on_match=merge_phrases) + doc = EN.make_doc('a b c') + EN.tagger(doc) matcher(doc) EN.entity(doc) diff --git a/spacy/tests/test_matcher.py b/spacy/tests/test_matcher.py index 2f6764e06..9bbc9b24d 100644 --- a/spacy/tests/test_matcher.py +++ b/spacy/tests/test_matcher.py @@ -1,8 +1,8 @@ # coding: utf-8 from __future__ import unicode_literals -from ...matcher import Matcher, PhraseMatcher -from ..util import get_doc +from 
..matcher import Matcher, PhraseMatcher +from .util import get_doc import pytest