From 7f876a7a8284a6a0479beb57c265e13a4bec4ea0 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 6 Aug 2017 00:00:21 +0200 Subject: [PATCH 01/49] Clean up some unused code in parser --- spacy/syntax/nn_parser.pxd | 4 --- spacy/syntax/nn_parser.pyx | 50 ++++++-------------------------------- 2 files changed, 8 insertions(+), 46 deletions(-) diff --git a/spacy/syntax/nn_parser.pxd b/spacy/syntax/nn_parser.pxd index 524718965..7ff4b9f9f 100644 --- a/spacy/syntax/nn_parser.pxd +++ b/spacy/syntax/nn_parser.pxd @@ -14,8 +14,4 @@ cdef class Parser: cdef readonly TransitionSystem moves cdef readonly object cfg - cdef void _parse_step(self, StateC* state, - const float* feat_weights, - int nr_class, int nr_feat, int nr_piece) nogil - #cdef int parseC(self, TokenC* tokens, int length, int nr_feat) nogil diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 0b39e2216..66787c22a 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -248,15 +248,10 @@ cdef class Parser: nI=token_vector_width) with Model.use_device('cpu'): - if depth == 0: - upper = chain() - upper.is_noop = True - else: - upper = chain( - clone(Maxout(hidden_width), (depth-1)), - zero_init(Affine(nr_class, drop_factor=0.0)) - ) - upper.is_noop = False + upper = chain( + clone(Maxout(hidden_width), (depth-1)), + zero_init(Affine(nr_class, drop_factor=0.0)) + ) # TODO: This is an unfortunate hack atm! # Used to set input dimensions in network. lower.begin_training(lower.ops.allocate((500, token_vector_width))) @@ -394,18 +389,11 @@ cdef class Parser: cdef np.ndarray scores c_token_ids = token_ids.data c_is_valid = is_valid.data - cdef int has_hidden = not getattr(vec2scores, 'is_noop', False) while not next_step.empty(): - if not has_hidden: - for i in cython.parallel.prange( - next_step.size(), num_threads=6, nogil=True): - self._parse_step(next_step[i], - feat_weights, nr_class, nr_feat, nr_piece) - else: - for i in range(next_step.size()): - st = next_step[i] - st.set_context_tokens(&c_token_ids[i*nr_feat], nr_feat) - self.moves.set_valid(&c_is_valid[i*nr_class], st) + for i in range(next_step.size()): + st = next_step[i] + st.set_context_tokens(&c_token_ids[i*nr_feat], nr_feat) + self.moves.set_valid(&c_is_valid[i*nr_class], st) vectors = state2vec(token_ids[:next_step.size()]) scores = vec2scores(vectors) c_scores = scores.data @@ -461,28 +449,6 @@ cdef class Parser: beams.append(beam) return beams - cdef void _parse_step(self, StateC* state, - const float* feat_weights, - int nr_class, int nr_feat, int nr_piece) nogil: - '''This only works with no hidden layers -- fast but inaccurate''' - #for i in cython.parallel.prange(next_step.size(), num_threads=4, nogil=True): - # self._parse_step(next_step[i], feat_weights, nr_class, nr_feat) - token_ids = calloc(nr_feat, sizeof(int)) - scores = calloc(nr_class * nr_piece, sizeof(float)) - is_valid = calloc(nr_class, sizeof(int)) - - state.set_context_tokens(token_ids, nr_feat) - sum_state_features(scores, - feat_weights, token_ids, 1, nr_feat, nr_class * nr_piece) - self.moves.set_valid(is_valid, state) - guess = arg_maxout_if_valid(scores, is_valid, nr_class, nr_piece) - action = self.moves.c[guess] - action.do(state, action.label) - - free(is_valid) - free(scores) - free(token_ids) - def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None): if losses is not None and self.name not in losses: losses[self.name] = 0. 
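Note: after this cleanup the parser's greedy loop simply scores each state and applies the best-scoring transition that is still valid (the arg_max_if_valid call kept in the hunk above). As a minimal, purely illustrative numpy sketch of that selection step (the real helper is a C-level function over raw score and validity arrays; the names below are only for illustration):

    import numpy as np

    def arg_max_if_valid(scores, is_valid):
        # Pick the highest-scoring transition whose validity flag is set,
        # as the remaining greedy loop does for each parser state.
        masked = np.where(is_valid, scores, -np.inf)
        return int(masked.argmax())

    # Example: action 2 scores highest overall, but only 0 and 1 are valid.
    scores = np.array([0.1, 0.7, 2.3])
    is_valid = np.array([True, True, False])
    assert arg_max_if_valid(scores, is_valid) == 1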
From 468c138ab3923276d36ace9d03586709a5c7f187 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 6 Aug 2017 01:13:23 +0200 Subject: [PATCH 02/49] WIP: Add fine-tuning logic to tagger model, re #1182 --- spacy/_ml.py | 70 ++++++++++++++++++++++++++++++++++++++++++++-- spacy/pipeline.pyx | 14 ++++------ 2 files changed, 73 insertions(+), 11 deletions(-) diff --git a/spacy/_ml.py b/spacy/_ml.py index f1ded666e..5f8ce9470 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -19,7 +19,7 @@ from thinc.api import FeatureExtracter, with_getitem from thinc.neural.pooling import Pooling, max_pool, mean_pool, sum_pool from thinc.neural._classes.attention import ParametricAttention from thinc.linear.linear import LinearModel -from thinc.api import uniqued, wrap +from thinc.api import uniqued, wrap, flatten_add_lengths from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE, TAG, DEP from .tokens.doc import Doc @@ -53,6 +53,27 @@ def _logistic(X, drop=0.): return Y, logistic_bwd +@layerize +def add_tuples(X, drop=0.): + """Give inputs of sequence pairs, where each sequence is (vals, length), + sum the values, returning a single sequence. + + If input is: + ((vals1, length), (vals2, length) + Output is: + (vals1+vals2, length) + + vals are a single tensor for the whole batch. + """ + (vals1, length1), (vals2, length2) = X + assert length1 == length2 + + def add_tuples_bwd(dY, sgd=None): + return (dY, dY) + + return (vals1+vals2, length), add_tuples_bwd + + def _zero_init(model): def _zero_init_impl(self, X, y): self.W.fill(0) @@ -61,6 +82,7 @@ def _zero_init(model): model.W.fill(0.) return model + @layerize def _preprocess_doc(docs, drop=0.): keys = [doc.to_array([LOWER]) for doc in docs] @@ -72,7 +94,6 @@ def _preprocess_doc(docs, drop=0.): return (keys, vals, lengths), None - def _init_for_precomputed(W, ops): if (W**2).sum() != 0.: return @@ -80,6 +101,7 @@ def _init_for_precomputed(W, ops): ops.xavier_uniform_init(reshaped) W[:] = reshaped.reshape(W.shape) + @describe.on_data(_set_dimensions_if_needed) @describe.attributes( nI=Dimension("Input size"), @@ -323,6 +345,21 @@ def get_token_vectors(tokens_attrs_vectors, drop=0.): return vectors, backward +def fine_tune(model1, combine=None): + def fine_tune_fwd(docs, drop=0.): + X1, bp_X1 = model1.begin_update(docs) + lengths = [len(doc) for doc in docs] + X2 = model1.ops.flatten(X1) + + def fine_tune_bwd(d_output, sgd=None): + bp_X1(d_output, sgd=sgd) + return d_output + + return (X1+X2, lengths), fine_tune_bwd + model = wrap(fine_tune_fwd) + return model + + @layerize def flatten(seqs, drop=0.): if isinstance(seqs[0], numpy.ndarray): @@ -370,6 +407,35 @@ def preprocess_doc(docs, drop=0.): return (keys, vals, lengths), None +def build_tagger_model(nr_class, token_vector_width, **cfg): + with Model.define_operators({'>>': chain, '+': add}): + # Input: (doc, tensor) tuples + embed_docs = with_getitem(0, + FeatureExtracter([NORM]) + >> HashEmbed(token_vector_width, 1000) + >> flatten_add_lengths + ) + + model = ( + fine_tune(embed_docs) + >> + with_getitem(0, + FeatureExtracter([NORM]) + >> HashEmbed(token_vector_width, 1000) + >> flatten_add_lengths + ) + >> with_getitem(1, + flatten_add_lengths) + >> add_tuples + >> with_flatten( + Maxout(token_vector_width, token_vector_width) + >> Softmax(nr_class, token_vector_width) + ) + ) + return model + + + def build_text_classifier(nr_class, width=64, **cfg): nr_vector = cfg.get('nr_vector', 200) with Model.define_operators({'>>': chain, '+': add, '|': concatenate, '**': clone}): diff --git 
a/spacy/pipeline.pyx b/spacy/pipeline.pyx index 947f0a1f1..b96387351 100644 --- a/spacy/pipeline.pyx +++ b/spacy/pipeline.pyx @@ -42,7 +42,7 @@ from .compat import json_dumps from .attrs import ID, LOWER, PREFIX, SUFFIX, SHAPE, TAG, DEP, POS from ._ml import rebatch, Tok2Vec, flatten, get_col, doc2feats -from ._ml import build_text_classifier +from ._ml import build_text_classifier, build_tagger_model from .parts_of_speech import X @@ -346,10 +346,8 @@ class NeuralTagger(BaseThincComponent): @classmethod def Model(cls, n_tags, token_vector_width): - return with_flatten( - chain(Maxout(token_vector_width, token_vector_width), - Softmax(n_tags, token_vector_width))) - + return build_tagger_model(n_tags, token_vector_width) + def use_params(self, params): with self.model.use_params(params): yield @@ -455,10 +453,8 @@ class NeuralLabeller(NeuralTagger): @classmethod def Model(cls, n_tags, token_vector_width): - return with_flatten( - chain(Maxout(token_vector_width, token_vector_width), - Softmax(n_tags, token_vector_width))) - + return build_tagger_model(n_tags, token_vector_width) + def get_loss(self, docs, golds, scores): scores = self.model.ops.flatten(scores) cdef int idx = 0 From e9ab800e15ba45ba919387107aadb0cec388872a Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 6 Aug 2017 01:50:08 +0200 Subject: [PATCH 03/49] Fix tagging model --- spacy/_ml.py | 27 +++++++++------------------ spacy/pipeline.pyx | 12 +++++++----- 2 files changed, 16 insertions(+), 23 deletions(-) diff --git a/spacy/_ml.py b/spacy/_ml.py index 5f8ce9470..e60e8a610 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -346,16 +346,16 @@ def get_token_vectors(tokens_attrs_vectors, drop=0.): def fine_tune(model1, combine=None): - def fine_tune_fwd(docs, drop=0.): + def fine_tune_fwd(docs_tokvecs, drop=0.): + docs, tokvecs = docs_tokvecs + lengths = model.ops.asarray([len(doc) for doc in docs], dtype='i') X1, bp_X1 = model1.begin_update(docs) - lengths = [len(doc) for doc in docs] - X2 = model1.ops.flatten(X1) def fine_tune_bwd(d_output, sgd=None): - bp_X1(d_output, sgd=sgd) + bp_X1(model1.ops.flatten(d_output), sgd=sgd) return d_output - return (X1+X2, lengths), fine_tune_bwd + return model1.ops.unflatten(X1+X2, lengths), fine_tune_bwd model = wrap(fine_tune_fwd) return model @@ -410,30 +410,21 @@ def preprocess_doc(docs, drop=0.): def build_tagger_model(nr_class, token_vector_width, **cfg): with Model.define_operators({'>>': chain, '+': add}): # Input: (doc, tensor) tuples - embed_docs = with_getitem(0, + embed_docs = ( FeatureExtracter([NORM]) + >> flatten >> HashEmbed(token_vector_width, 1000) - >> flatten_add_lengths ) model = ( fine_tune(embed_docs) - >> - with_getitem(0, - FeatureExtracter([NORM]) - >> HashEmbed(token_vector_width, 1000) - >> flatten_add_lengths - ) - >> with_getitem(1, - flatten_add_lengths) - >> add_tuples >> with_flatten( Maxout(token_vector_width, token_vector_width) >> Softmax(nr_class, token_vector_width) ) ) - return model - + model.nI = None + return model def build_text_classifier(nr_class, width=64, **cfg): diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx index b96387351..848653c5c 100644 --- a/spacy/pipeline.pyx +++ b/spacy/pipeline.pyx @@ -253,23 +253,25 @@ class NeuralTagger(BaseThincComponent): self.cfg = dict(cfg) def __call__(self, doc): - tags = self.predict([doc.tensor]) + tags = self.predict(([doc], [doc.tensor])) self.set_annotations([doc], tags) return doc def pipe(self, stream, batch_size=128, n_threads=-1): for docs in cytoolz.partition_all(batch_size, stream): + 
docs = list(docs) tokvecs = [d.tensor for d in docs] - tag_ids = self.predict(tokvecs) + tag_ids = self.predict((docs, tokvecs)) self.set_annotations(docs, tag_ids) yield from docs - def predict(self, tokvecs): - scores = self.model(tokvecs) + def predict(self, docs_tokvecs): + scores = self.model(docs_tokvecs) scores = self.model.ops.flatten(scores) guesses = scores.argmax(axis=1) if not isinstance(guesses, numpy.ndarray): guesses = guesses.get() + tokvecs = docs_tokvecs[1] guesses = self.model.ops.unflatten(guesses, [tv.shape[0] for tv in tokvecs]) return guesses @@ -295,7 +297,7 @@ class NeuralTagger(BaseThincComponent): if self.model.nI is None: self.model.nI = tokvecs[0].shape[1] - tag_scores, bp_tag_scores = self.model.begin_update(tokvecs, drop=drop) + tag_scores, bp_tag_scores = self.model.begin_update(docs_tokvecs, drop=drop) loss, d_tag_scores = self.get_loss(docs, golds, tag_scores) d_tokvecs = bp_tag_scores(d_tag_scores, sgd=sgd) From 4cfb7a54e78c077dc6ac743ec7ccfe8a3b341ebd Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 6 Aug 2017 01:53:31 +0200 Subject: [PATCH 04/49] Fix tagger --- spacy/_ml.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/_ml.py b/spacy/_ml.py index e60e8a610..c0025e597 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -350,6 +350,7 @@ def fine_tune(model1, combine=None): docs, tokvecs = docs_tokvecs lengths = model.ops.asarray([len(doc) for doc in docs], dtype='i') X1, bp_X1 = model1.begin_update(docs) + X2 = model1.ops.flatten(tokvecs) def fine_tune_bwd(d_output, sgd=None): bp_X1(model1.ops.flatten(d_output), sgd=sgd) From cc19ea0e7ca9c8adfb779dfafb1d534d55d78e5e Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 6 Aug 2017 02:17:10 +0200 Subject: [PATCH 05/49] Add update_tensors flag to Language.update. Experimental, re #1182 --- spacy/language.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index 0284c4636..4a489387a 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -277,7 +277,8 @@ class Language(object): def make_doc(self, text): return self.tokenizer(text) - def update(self, docs, golds, drop=0., sgd=None, losses=None): + def update(self, docs, golds, drop=0., sgd=None, losses=None, + update_tensors=False): """Update the models in the pipeline. docs (iterable): A batch of `Doc` objects. @@ -310,7 +311,7 @@ class Language(object): tokvecses, bp_tokvecses = tok2vec.model.begin_update(feats, drop=drop) d_tokvecses = proc.update((docs, tokvecses), golds, drop=drop, sgd=get_grads, losses=losses) - if d_tokvecses is not None: + if update_tensors and d_tokvecses is not None: bp_tokvecses(d_tokvecses, sgd=sgd) for key, (W, dW) in grads.items(): sgd(W, dW, key=key) From 0a566dc320a1103569df90791e1ede24f513e78e Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 6 Aug 2017 02:18:12 +0200 Subject: [PATCH 06/49] Add update_tensors flag to Language.update. 
Experimental, re #1182 --- spacy/cli/train.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index af028dae5..9ed621c12 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -91,7 +91,8 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0, for batch in minibatch(train_docs, size=batch_sizes): docs, golds = zip(*batch) nlp.update(docs, golds, sgd=optimizer, - drop=next(dropout_rates), losses=losses) + drop=next(dropout_rates), losses=losses, + update_tensors=True) pbar.update(sum(len(doc) for doc in docs)) with nlp.use_params(optimizer.averages): From 0eec7c9e9b7bfafaa80633942088ce6016e3c918 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 6 Aug 2017 02:18:31 +0200 Subject: [PATCH 07/49] Fix Language.evaluate --- spacy/language.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index 4a489387a..3079249f6 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -382,9 +382,13 @@ class Language(object): return optimizer def evaluate(self, docs_golds): - docs, golds = zip(*docs_golds) scorer = Scorer() - for doc, gold in zip(self.pipe(docs, batch_size=32), golds): + docs, golds = zip(*docs_golds) + docs = list(docs) + golds = list(golds) + for pipe in self.pipeline: + docs = pipe.pipe(docs) + for doc, gold in zip(docs, golds): scorer.score(doc, gold) doc.tensor = None return scorer From bfffdeabb2ad16b65a1d5c2b0c0f088d47e7f7cc Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 6 Aug 2017 14:10:48 +0200 Subject: [PATCH 08/49] Fix parser batch-size bug introduced during cleanup --- spacy/syntax/nn_parser.pyx | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 66787c22a..4be31b4de 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -339,12 +339,10 @@ cdef class Parser: The number of threads with which to work on the buffer in parallel. Yields (Doc): Documents, in order. 
""" - cdef StateClass parse_state cdef Doc doc - queue = [] for docs in cytoolz.partition_all(batch_size, docs): docs = list(docs) - tokvecs = [d.tensor for d in docs] + tokvecs = [doc.tensor for doc in docs] if beam_width == 1: parse_states = self.parse_batch(docs, tokvecs) else: @@ -364,6 +362,8 @@ cdef class Parser: int nr_class, nr_feat, nr_piece, nr_dim, nr_state if isinstance(docs, Doc): docs = [docs] + if isinstance(tokvecses, np.ndarray): + tokvecses = [tokvecses] tokvecs = self.model[0].ops.flatten(tokvecses) @@ -395,14 +395,14 @@ cdef class Parser: st.set_context_tokens(&c_token_ids[i*nr_feat], nr_feat) self.moves.set_valid(&c_is_valid[i*nr_class], st) vectors = state2vec(token_ids[:next_step.size()]) - scores = vec2scores(vectors) - c_scores = scores.data - for i in range(next_step.size()): - st = next_step[i] - guess = arg_max_if_valid( - &c_scores[i*nr_class], &c_is_valid[i*nr_class], nr_class) - action = self.moves.c[guess] - action.do(st, action.label) + scores = vec2scores(vectors) + c_scores = scores.data + for i in range(next_step.size()): + st = next_step[i] + guess = arg_max_if_valid( + &c_scores[i*nr_class], &c_is_valid[i*nr_class], nr_class) + action = self.moves.c[guess] + action.do(st, action.label) this_step, next_step = next_step, this_step next_step.clear() for st in this_step: From 0acce0521b3768dce2029db28168b5ec79aac741 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 6 Aug 2017 14:13:03 +0200 Subject: [PATCH 09/49] Fix Language.update for pipeline --- spacy/language.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/spacy/language.py b/spacy/language.py index 3079249f6..6d97f41fe 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -387,7 +387,12 @@ class Language(object): docs = list(docs) golds = list(golds) for pipe in self.pipeline: - docs = pipe.pipe(docs) + if not hasattr(pipe, 'pipe'): + for doc in docs: + pipe(doc) + else: + docs = list(pipe.pipe(docs)) + assert len(docs) == len(golds) for doc, gold in zip(docs, golds): scorer.score(doc, gold) doc.tensor = None From 3cb8f06881f5991e4de2be4520e201612debb911 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 6 Aug 2017 14:15:14 +0200 Subject: [PATCH 10/49] Fix NeuralLabeller --- spacy/pipeline.pyx | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx index 848653c5c..b87f73c27 100644 --- a/spacy/pipeline.pyx +++ b/spacy/pipeline.pyx @@ -296,7 +296,6 @@ class NeuralTagger(BaseThincComponent): if self.model.nI is None: self.model.nI = tokvecs[0].shape[1] - tag_scores, bp_tag_scores = self.model.begin_update(docs_tokvecs, drop=drop) loss, d_tag_scores = self.get_loss(docs, golds, tag_scores) @@ -432,7 +431,7 @@ class NeuralLabeller(NeuralTagger): @property def labels(self): - return self.cfg.get('labels', {}) + return self.cfg.setdefault('labels', {}) @labels.setter def labels(self, value): From 4a5cc8913845accce3033bd751f503dd72d9c2ff Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 6 Aug 2017 14:15:48 +0200 Subject: [PATCH 11/49] Fix tagger 'fine_tune', to keep private CNN weights --- spacy/_ml.py | 42 +++++++++++++++++++++++++----------------- 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/spacy/_ml.py b/spacy/_ml.py index c0025e597..bc08dfbbc 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -5,6 +5,7 @@ from thinc.neural._classes.hash_embed import HashEmbed from thinc.neural.ops import NumpyOps, CupyOps from thinc.neural.util import get_array_module import random +import cytoolz 
from thinc.neural._classes.convolution import ExtractWindow from thinc.neural._classes.static_vectors import StaticVectors @@ -207,9 +208,9 @@ class PrecomputableMaxouts(Model): def Tok2Vec(width, embed_size, preprocess=None): - cols = [ID, NORM, PREFIX, SUFFIX, SHAPE] + cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH] with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add}): - norm = get_col(cols.index(NORM)) >> HashEmbed(width, embed_size, name='embed_lower') + norm = get_col(cols.index(NORM)) >> HashEmbed(width, embed_size, name='embed_lower') prefix = get_col(cols.index(PREFIX)) >> HashEmbed(width, embed_size//2, name='embed_prefix') suffix = get_col(cols.index(SUFFIX)) >> HashEmbed(width, embed_size//2, name='embed_suffix') shape = get_col(cols.index(SHAPE)) >> HashEmbed(width, embed_size//2, name='embed_shape') @@ -218,7 +219,7 @@ def Tok2Vec(width, embed_size, preprocess=None): tok2vec = ( with_flatten( asarray(Model.ops, dtype='uint64') - >> embed + >> uniqued(embed, column=5) >> Maxout(width, width*4, pieces=3) >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3)) >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3)) @@ -319,7 +320,7 @@ def zero_init(model): def doc2feats(cols=None): - cols = [ID, NORM, PREFIX, SUFFIX, SHAPE] + cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH] def forward(docs, drop=0.): feats = [] for doc in docs: @@ -345,19 +346,26 @@ def get_token_vectors(tokens_attrs_vectors, drop=0.): return vectors, backward -def fine_tune(model1, combine=None): +def fine_tune(embedding, combine=None): + if combine is not None: + raise NotImplementedError( + "fine_tune currently only supports addition. Set combine=None") def fine_tune_fwd(docs_tokvecs, drop=0.): docs, tokvecs = docs_tokvecs lengths = model.ops.asarray([len(doc) for doc in docs], dtype='i') - X1, bp_X1 = model1.begin_update(docs) - X2 = model1.ops.flatten(tokvecs) + + vecs, bp_vecs = embedding.begin_update(docs, drop=drop) + + output = embedding.ops.unflatten( + embedding.ops.flatten(tokvecs) + + embedding.ops.flatten(vecs), + lengths) def fine_tune_bwd(d_output, sgd=None): - bp_X1(model1.ops.flatten(d_output), sgd=sgd) + bp_vecs(d_output, sgd=sgd) return d_output - - return model1.ops.unflatten(X1+X2, lengths), fine_tune_bwd - model = wrap(fine_tune_fwd) + return output, fine_tune_bwd + model = wrap(fine_tune_fwd, embedding) return model @@ -407,18 +415,18 @@ def preprocess_doc(docs, drop=0.): vals = ops.allocate(keys.shape[0]) + 1 return (keys, vals, lengths), None +def getitem(i): + def getitem_fwd(X, drop=0.): + return X[i], None + return layerize(getitem_fwd) def build_tagger_model(nr_class, token_vector_width, **cfg): with Model.define_operators({'>>': chain, '+': add}): # Input: (doc, tensor) tuples - embed_docs = ( - FeatureExtracter([NORM]) - >> flatten - >> HashEmbed(token_vector_width, 1000) - ) + private_tok2vec = Tok2Vec(token_vector_width, 7500, preprocess=doc2feats()) model = ( - fine_tune(embed_docs) + fine_tune(private_tok2vec) >> with_flatten( Maxout(token_vector_width, token_vector_width) >> Softmax(nr_class, token_vector_width) From 78498a072de1104baefe0e5b075303d24a7828f6 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 6 Aug 2017 14:16:36 +0200 Subject: [PATCH 12/49] Return Transition for missing actions in lookup_action --- spacy/syntax/arc_eager.pyx | 1 + spacy/syntax/transition_system.pyx | 2 ++ 2 files changed, 3 insertions(+) diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index 29e8de0aa..9477449a5 100644 --- 
a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -385,6 +385,7 @@ cdef class ArcEager(TransitionSystem): for i in range(self.n_moves): if self.c[i].move == move and self.c[i].label == label: return self.c[i] + return Transition(clas=0, move=MISSING, label=0) def move_name(self, int move, attr_t label): label_str = self.strings[label] diff --git a/spacy/syntax/transition_system.pyx b/spacy/syntax/transition_system.pyx index 27b375bba..d3f64f827 100644 --- a/spacy/syntax/transition_system.pyx +++ b/spacy/syntax/transition_system.pyx @@ -107,6 +107,8 @@ cdef class TransitionSystem: def is_valid(self, StateClass stcls, move_name): action = self.lookup_transition(move_name) + if action.move == 0: + return False return action.is_valid(stcls.c, action.label) cdef int set_valid(self, int* is_valid, const StateC* st) nogil: From 3ed203de2504edd2b5470ecfa4ef8a5b2e382b2a Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 6 Aug 2017 18:33:18 +0200 Subject: [PATCH 13/49] Use LayerNorm and SELU in Tok2Vec --- spacy/_ml.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/spacy/_ml.py b/spacy/_ml.py index bc08dfbbc..f7ab9b259 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -10,6 +10,7 @@ import cytoolz from thinc.neural._classes.convolution import ExtractWindow from thinc.neural._classes.static_vectors import StaticVectors from thinc.neural._classes.batchnorm import BatchNorm +from thinc.neural._classes.layernorm import LayerNorm as LN from thinc.neural._classes.resnet import Residual from thinc.neural import ReLu from thinc.neural._classes.selu import SELU @@ -220,11 +221,11 @@ def Tok2Vec(width, embed_size, preprocess=None): with_flatten( asarray(Model.ops, dtype='uint64') >> uniqued(embed, column=5) - >> Maxout(width, width*4, pieces=3) - >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3)) - >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3)) - >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3)) - >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3)), + >> LN(Maxout(width, width*4, pieces=3)) + >> Residual(ExtractWindow(nW=1) >> SELU(width, width*3)) + >> Residual(ExtractWindow(nW=1) >> SELU(width, width*3)) + >> Residual(ExtractWindow(nW=1) >> SELU(width, width*3)) + >> Residual(ExtractWindow(nW=1) >> SELU(width, width*3)), pad=4) ) if preprocess not in (False, None): From 42bd26f6f397c5234b9b82c3daa14c3c0268bd3c Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 6 Aug 2017 18:33:46 +0200 Subject: [PATCH 14/49] Give parser its own tok2vec weights --- spacy/syntax/nn_parser.pyx | 40 +++++++++++++++++++++++++++----------- 1 file changed, 29 insertions(+), 11 deletions(-) diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 4be31b4de..06c61656b 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -237,6 +237,7 @@ cdef class Parser: token_vector_width = util.env_opt('token_vector_width', token_vector_width) hidden_width = util.env_opt('hidden_width', hidden_width) parser_maxout_pieces = util.env_opt('parser_maxout_pieces', 2) + tensors = Tok2Vec(token_vector_width, 7500, preprocess=doc2feats()) if parser_maxout_pieces == 1: lower = PrecomputableAffine(hidden_width if depth >= 1 else nr_class, nF=cls.nr_feature, @@ -263,7 +264,7 @@ cdef class Parser: 'hidden_width': hidden_width, 'maxout_pieces': parser_maxout_pieces } - return (lower, upper), cfg + return (tensors, lower, upper), cfg def __init__(self, Vocab vocab, moves=True, model=True, **cfg): """ @@ -366,6 +367,7 @@ cdef 
class Parser: tokvecses = [tokvecses] tokvecs = self.model[0].ops.flatten(tokvecses) + tokvecs += self.model[0].ops.flatten(self.model[0](docs)) nr_state = len(docs) nr_class = self.moves.n_moves @@ -417,6 +419,7 @@ cdef class Parser: cdef int nr_class = self.moves.n_moves cdef StateClass stcls, output tokvecs = self.model[0].ops.flatten(tokvecses) + tokvecs += self.model[0].ops.flatten(self.model[0](docs)) cuda_stream = get_cuda_stream() state2vec, vec2scores = self.get_batch_model(len(docs), tokvecs, cuda_stream, 0.0) @@ -457,6 +460,9 @@ cdef class Parser: if isinstance(docs, Doc) and isinstance(golds, GoldParse): docs = [docs] golds = [golds] + my_tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs, drop=0.) + my_tokvecs = self.model[0].ops.flatten(my_tokvecs) + tokvecs += my_tokvecs cuda_stream = get_cuda_stream() @@ -506,7 +512,9 @@ cdef class Parser: break self._make_updates(d_tokvecs, backprops, sgd, cuda_stream) - return self.model[0].ops.unflatten(d_tokvecs, [len(d) for d in docs]) + d_tokvecs = self.model[0].ops.unflatten(d_tokvecs, [len(d) for d in docs]) + #bp_my_tokvecs(d_tokvecs, sgd=sgd) + return d_tokvecs def _init_gold_batch(self, whole_docs, whole_golds): """Make a square batch, of length equal to the shortest doc. A long @@ -569,7 +577,7 @@ cdef class Parser: return names def get_batch_model(self, batch_size, tokvecs, stream, dropout): - lower, upper = self.model + _, lower, upper = self.model state2vec = precompute_hiddens(batch_size, tokvecs, lower, stream, drop=dropout) return state2vec, upper @@ -659,10 +667,12 @@ cdef class Parser: def to_disk(self, path, **exclude): serializers = { - 'lower_model': lambda p: p.open('wb').write( + 'tok2vec_model': lambda p: p.open('wb').write( self.model[0].to_bytes()), - 'upper_model': lambda p: p.open('wb').write( + 'lower_model': lambda p: p.open('wb').write( self.model[1].to_bytes()), + 'upper_model': lambda p: p.open('wb').write( + self.model[2].to_bytes()), 'vocab': lambda p: self.vocab.to_disk(p), 'moves': lambda p: self.moves.to_disk(p, strings=False), 'cfg': lambda p: p.open('w').write(json_dumps(self.cfg)) @@ -683,24 +693,29 @@ cdef class Parser: self.model, cfg = self.Model(**self.cfg) else: cfg = {} - with (path / 'lower_model').open('rb') as file_: + with (path / 'tok2vec_model').open('rb') as file_: bytes_data = file_.read() self.model[0].from_bytes(bytes_data) - with (path / 'upper_model').open('rb') as file_: + with (path / 'lower_model').open('rb') as file_: bytes_data = file_.read() self.model[1].from_bytes(bytes_data) + with (path / 'upper_model').open('rb') as file_: + bytes_data = file_.read() + self.model[2].from_bytes(bytes_data) self.cfg.update(cfg) return self def to_bytes(self, **exclude): serializers = OrderedDict(( - ('lower_model', lambda: self.model[0].to_bytes()), - ('upper_model', lambda: self.model[1].to_bytes()), + ('tok2vec_model', lambda: self.model[0].to_bytes()), + ('lower_model', lambda: self.model[1].to_bytes()), + ('upper_model', lambda: self.model[2].to_bytes()), ('vocab', lambda: self.vocab.to_bytes()), ('moves', lambda: self.moves.to_bytes(strings=False)), ('cfg', lambda: ujson.dumps(self.cfg)) )) if 'model' in exclude: + exclude['tok2vec_model'] = True exclude['lower_model'] = True exclude['upper_model'] = True exclude.pop('model') @@ -711,6 +726,7 @@ cdef class Parser: ('vocab', lambda b: self.vocab.from_bytes(b)), ('moves', lambda b: self.moves.from_bytes(b, strings=False)), ('cfg', lambda b: self.cfg.update(ujson.loads(b))), + ('tok2vec_model', lambda b: None), ('lower_model', 
lambda b: None), ('upper_model', lambda b: None) )) @@ -720,10 +736,12 @@ cdef class Parser: self.model, cfg = self.Model(self.moves.n_moves) else: cfg = {} + if 'tok2vec_model' in msg: + self.model[0].from_bytes(msg['tok2vec_model']) if 'lower_model' in msg: - self.model[0].from_bytes(msg['lower_model']) + self.model[1].from_bytes(msg['lower_model']) if 'upper_model' in msg: - self.model[1].from_bytes(msg['upper_model']) + self.model[2].from_bytes(msg['upper_model']) self.cfg.update(cfg) return self From 5d837c37762cb06a230906be80225e0e421c6cb2 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 7 Aug 2017 06:32:59 -0500 Subject: [PATCH 15/49] Add mix weights on fine_tune --- spacy/_ml.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/spacy/_ml.py b/spacy/_ml.py index f7ab9b259..d28f48c42 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -356,17 +356,24 @@ def fine_tune(embedding, combine=None): lengths = model.ops.asarray([len(doc) for doc in docs], dtype='i') vecs, bp_vecs = embedding.begin_update(docs, drop=drop) - + flat_tokvecs = embedding.ops.flatten(tokvecs) + flat_vecs = embedding.ops.flatten(vecs) output = embedding.ops.unflatten( - embedding.ops.flatten(tokvecs) - + embedding.ops.flatten(vecs), + (model.mix[0] * flat_vecs + model.mix[1] * flat_tokvecs), lengths) def fine_tune_bwd(d_output, sgd=None): bp_vecs(d_output, sgd=sgd) + flat_grad = model.ops.flatten(d_output) + model.d_mix[1] += flat_tokvecs.dot(flat_grad.T).sum() + model.d_mix[0] += flat_vecs.dot(flat_grad.T).sum() + sgd(model._mem.weights, model._mem.gradient, key=model.id) return d_output return output, fine_tune_bwd model = wrap(fine_tune_fwd, embedding) + model.mix = model._mem.add((model.id, 'mix'), (2,)) + model.mix.fill(1.) + model.d_mix = model._mem.add_gradient((model.id, 'd_mix'), (model.id, 'mix')) return model From 88bf1cf87c874c2e9fa0d88aa28db07907b6ad90 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 8 Aug 2017 15:34:17 -0500 Subject: [PATCH 16/49] Update parser for fine tuning --- spacy/_ml.py | 18 +++++++++--------- spacy/syntax/nn_parser.pyx | 10 +++++----- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/spacy/_ml.py b/spacy/_ml.py index d28f48c42..01f166b9f 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -222,11 +222,11 @@ def Tok2Vec(width, embed_size, preprocess=None): asarray(Model.ops, dtype='uint64') >> uniqued(embed, column=5) >> LN(Maxout(width, width*4, pieces=3)) - >> Residual(ExtractWindow(nW=1) >> SELU(width, width*3)) - >> Residual(ExtractWindow(nW=1) >> SELU(width, width*3)) - >> Residual(ExtractWindow(nW=1) >> SELU(width, width*3)) - >> Residual(ExtractWindow(nW=1) >> SELU(width, width*3)), - pad=4) + >> Residual(ExtractWindow(nW=1) >> LN(Maxout(width, width*3))) + >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3)) + >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3)) + >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3)), + pad=4) ) if preprocess not in (False, None): tok2vec = preprocess >> tok2vec @@ -432,8 +432,8 @@ def build_tagger_model(nr_class, token_vector_width, **cfg): with Model.define_operators({'>>': chain, '+': add}): # Input: (doc, tensor) tuples private_tok2vec = Tok2Vec(token_vector_width, 7500, preprocess=doc2feats()) - - model = ( + + model = ( fine_tune(private_tok2vec) >> with_flatten( Maxout(token_vector_width, token_vector_width) @@ -457,7 +457,7 @@ def build_text_classifier(nr_class, width=64, **cfg): >> _flatten_add_lengths >> with_getitem(0, uniqued( - (embed_lower | 
embed_prefix | embed_suffix | embed_shape) + (embed_lower | embed_prefix | embed_suffix | embed_shape) >> Maxout(width, width+(width//2)*3)) >> Residual(ExtractWindow(nW=1) >> ReLu(width, width*3)) >> Residual(ExtractWindow(nW=1) >> ReLu(width, width*3)) @@ -478,7 +478,7 @@ def build_text_classifier(nr_class, width=64, **cfg): >> zero_init(Affine(nr_class, nr_class*2, drop_factor=0.0)) >> logistic ) - + model.lsuv = False return model diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 06c61656b..00835f697 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -44,7 +44,7 @@ from thinc.neural.util import get_array_module from .. import util from ..util import get_async, get_cuda_stream from .._ml import zero_init, PrecomputableAffine, PrecomputableMaxouts -from .._ml import Tok2Vec, doc2feats, rebatch +from .._ml import Tok2Vec, doc2feats, rebatch, fine_tune from ..compat import json_dumps from . import _parse_features @@ -237,7 +237,7 @@ cdef class Parser: token_vector_width = util.env_opt('token_vector_width', token_vector_width) hidden_width = util.env_opt('hidden_width', hidden_width) parser_maxout_pieces = util.env_opt('parser_maxout_pieces', 2) - tensors = Tok2Vec(token_vector_width, 7500, preprocess=doc2feats()) + tensors = fine_tune(Tok2Vec(token_vector_width, 7500, preprocess=doc2feats())) if parser_maxout_pieces == 1: lower = PrecomputableAffine(hidden_width if depth >= 1 else nr_class, nF=cls.nr_feature, @@ -367,7 +367,7 @@ cdef class Parser: tokvecses = [tokvecses] tokvecs = self.model[0].ops.flatten(tokvecses) - tokvecs += self.model[0].ops.flatten(self.model[0](docs)) + tokvecs += self.model[0].ops.flatten(self.model[0]((docs, tokvecses))) nr_state = len(docs) nr_class = self.moves.n_moves @@ -419,7 +419,7 @@ cdef class Parser: cdef int nr_class = self.moves.n_moves cdef StateClass stcls, output tokvecs = self.model[0].ops.flatten(tokvecses) - tokvecs += self.model[0].ops.flatten(self.model[0](docs)) + tokvecs += self.model[0].ops.flatten(self.model[0]((docs, tokvecses))) cuda_stream = get_cuda_stream() state2vec, vec2scores = self.get_batch_model(len(docs), tokvecs, cuda_stream, 0.0) @@ -460,7 +460,7 @@ cdef class Parser: if isinstance(docs, Doc) and isinstance(golds, GoldParse): docs = [docs] golds = [golds] - my_tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs, drop=0.) + my_tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=0.) 
my_tokvecs = self.model[0].ops.flatten(my_tokvecs) tokvecs += my_tokvecs From dbdd8afc4bb4fa56db69ddca584df7505888e46b Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 8 Aug 2017 15:46:07 -0500 Subject: [PATCH 17/49] Fix parser fine-tune training --- spacy/syntax/nn_parser.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 00835f697..31c3801a2 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -513,7 +513,7 @@ cdef class Parser: self._make_updates(d_tokvecs, backprops, sgd, cuda_stream) d_tokvecs = self.model[0].ops.unflatten(d_tokvecs, [len(d) for d in docs]) - #bp_my_tokvecs(d_tokvecs, sgd=sgd) + bp_my_tokvecs(d_tokvecs, sgd=sgd) return d_tokvecs def _init_gold_batch(self, whole_docs, whole_golds): From 28e2fec23bf5f654490c8d8f17d551fda190e831 Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 9 Aug 2017 11:52:38 +0200 Subject: [PATCH 18/49] Fix autolinking failure on fresh model install (resolves #1138) On fresh install via subprocess, pip.get_installed_distributions() won't show new model, so is_package check in link command fails. Solution for now is to get model package path explicitly and pass it to link command. --- spacy/cli/download.py | 8 ++++++-- spacy/cli/link.py | 4 ++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/spacy/cli/download.py b/spacy/cli/download.py index b6e5549da..675ae8cee 100644 --- a/spacy/cli/download.py +++ b/spacy/cli/download.py @@ -8,7 +8,7 @@ import subprocess import sys from .link import link -from ..util import prints +from ..util import prints, get_package_path from .. import about @@ -32,7 +32,11 @@ def download(cmd, model, direct=False): version = get_version(model_name, compatibility) download_model('{m}-{v}/{m}-{v}.tar.gz'.format(m=model_name, v=version)) try: - link(None, model_name, model, force=True) + # Get package path here because link uses + # pip.get_installed_distributions() to check if model is a package, + # which fails if model was just installed via subprocess + package_path = get_package_path(model_name) + link(None, model_name, model, force=True, model_path=package_path) except: # Dirty, but since spacy.download and the auto-linking is mostly # a convenience wrapper, it's best to show a success message and diff --git a/spacy/cli/link.py b/spacy/cli/link.py index a8ee01565..712a05aee 100644 --- a/spacy/cli/link.py +++ b/spacy/cli/link.py @@ -14,7 +14,7 @@ from .. import util link_name=("name of shortuct link to create", "positional", None, str), force=("force overwriting of existing link", "flag", "f", bool) ) -def link(cmd, origin, link_name, force=False): +def link(cmd, origin, link_name, force=False, model_path=None): """ Create a symlink for models within the spacy/data directory. 
Accepts either the name of a pip package, or the local path to the model data @@ -23,7 +23,7 @@ def link(cmd, origin, link_name, force=False): if util.is_package(origin): model_path = util.get_package_path(origin) else: - model_path = Path(origin) + model_path = Path(origin) if model_path is None else Path(model_path) if not model_path.exists(): prints("The data should be located in %s" % path2str(model_path), title="Can't locate model data", exits=1) From 764540a6dd36b4a51fc6b9f28786aa5ffeaee202 Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 9 Aug 2017 12:16:30 +0200 Subject: [PATCH 19/49] Don't ignore /bin directory --- .gitignore | 1 - 1 file changed, 1 deletion(-) diff --git a/.gitignore b/.gitignore index 52838918c..cb0a8e84e 100644 --- a/.gitignore +++ b/.gitignore @@ -40,7 +40,6 @@ venv/ # Distribution / packaging env/ -bin/ build/ develop-eggs/ dist/ From 495e0424291e95846fcccb679c938a0a1e8f6ff1 Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 9 Aug 2017 12:17:30 +0200 Subject: [PATCH 20/49] Add entry point-style auto alias for "spacy" Simplest way to run commands as spacy xxx instead of python -m spacy xxx, while avoiding environment conflicts --- MANIFEST.in | 1 + bin/spacy | 1 + setup.py | 1 + 3 files changed, 3 insertions(+) create mode 100644 bin/spacy diff --git a/MANIFEST.in b/MANIFEST.in index 697748835..4d804a23e 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,3 +1,4 @@ recursive-include include *.h include LICENSE include README.rst +include bin/spacy diff --git a/bin/spacy b/bin/spacy new file mode 100644 index 000000000..29d9a80e5 --- /dev/null +++ b/bin/spacy @@ -0,0 +1 @@ +python -m spacy "$@" diff --git a/setup.py b/setup.py index ecdf15536..0a3384ed5 100755 --- a/setup.py +++ b/setup.py @@ -187,6 +187,7 @@ def setup_package(): url=about['__uri__'], license=about['__license__'], ext_modules=ext_modules, + scripts=['bin/spacy'], install_requires=[ 'numpy>=1.7', 'murmurhash>=0.28,<0.29', From bcce6f7de0d03c86c5c189381d00de16b6cdbb19 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 9 Aug 2017 16:23:12 -0500 Subject: [PATCH 21/49] Fix parser fine tuning --- spacy/syntax/nn_parser.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 00835f697..31c3801a2 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -513,7 +513,7 @@ cdef class Parser: self._make_updates(d_tokvecs, backprops, sgd, cuda_stream) d_tokvecs = self.model[0].ops.unflatten(d_tokvecs, [len(d) for d in docs]) - #bp_my_tokvecs(d_tokvecs, sgd=sgd) + bp_my_tokvecs(d_tokvecs, sgd=sgd) return d_tokvecs def _init_gold_batch(self, whole_docs, whole_golds): From bbace204bec8160936ff8ce9b50b8194b5d94a23 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 9 Aug 2017 16:40:42 -0500 Subject: [PATCH 22/49] Gate parser fine-tuning behind feature flag --- spacy/syntax/nn_parser.pyx | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 31c3801a2..f1f21134c 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -59,8 +59,9 @@ from ..structs cimport TokenC from ..tokens.doc cimport Doc from ..strings cimport StringStore from ..gold cimport GoldParse -from ..attrs cimport TAG, DEP +from ..attrs cimport ID, TAG, DEP, ORTH, NORM, PREFIX, SUFFIX, TAG +USE_FINE_TUNE = True def get_templates(*args, **kwargs): return [] @@ -237,7 +238,8 @@ cdef class Parser: token_vector_width = 
util.env_opt('token_vector_width', token_vector_width) hidden_width = util.env_opt('hidden_width', hidden_width) parser_maxout_pieces = util.env_opt('parser_maxout_pieces', 2) - tensors = fine_tune(Tok2Vec(token_vector_width, 7500, preprocess=doc2feats())) + tensors = fine_tune(Tok2Vec(token_vector_width, 7500, + preprocess=doc2feats(cols=[ID, NORM, PREFIX, SUFFIX, TAG]))) if parser_maxout_pieces == 1: lower = PrecomputableAffine(hidden_width if depth >= 1 else nr_class, nF=cls.nr_feature, @@ -367,7 +369,8 @@ cdef class Parser: tokvecses = [tokvecses] tokvecs = self.model[0].ops.flatten(tokvecses) - tokvecs += self.model[0].ops.flatten(self.model[0]((docs, tokvecses))) + if USE_FINE_TUNE: + tokvecs += self.model[0].ops.flatten(self.model[0]((docs, tokvecses))) nr_state = len(docs) nr_class = self.moves.n_moves @@ -419,7 +422,8 @@ cdef class Parser: cdef int nr_class = self.moves.n_moves cdef StateClass stcls, output tokvecs = self.model[0].ops.flatten(tokvecses) - tokvecs += self.model[0].ops.flatten(self.model[0]((docs, tokvecses))) + if USE_FINE_TUNE: + tokvecs += self.model[0].ops.flatten(self.model[0]((docs, tokvecses))) cuda_stream = get_cuda_stream() state2vec, vec2scores = self.get_batch_model(len(docs), tokvecs, cuda_stream, 0.0) @@ -460,9 +464,10 @@ cdef class Parser: if isinstance(docs, Doc) and isinstance(golds, GoldParse): docs = [docs] golds = [golds] - my_tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=0.) - my_tokvecs = self.model[0].ops.flatten(my_tokvecs) - tokvecs += my_tokvecs + if USE_FINE_TUNE: + my_tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=0.) + my_tokvecs = self.model[0].ops.flatten(my_tokvecs) + tokvecs += my_tokvecs cuda_stream = get_cuda_stream() @@ -513,7 +518,8 @@ cdef class Parser: self._make_updates(d_tokvecs, backprops, sgd, cuda_stream) d_tokvecs = self.model[0].ops.unflatten(d_tokvecs, [len(d) for d in docs]) - bp_my_tokvecs(d_tokvecs, sgd=sgd) + if USE_FINE_TUNE: + bp_my_tokvecs(d_tokvecs, sgd=sgd) return d_tokvecs def _init_gold_batch(self, whole_docs, whole_golds): From ac2de6dced7a3fa3d224487c61885b334c493392 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 9 Aug 2017 16:41:25 -0500 Subject: [PATCH 23/49] Switch to ReLu layers in Tok2Vec --- spacy/_ml.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/spacy/_ml.py b/spacy/_ml.py index 01f166b9f..d08a43b8e 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -220,12 +220,12 @@ def Tok2Vec(width, embed_size, preprocess=None): tok2vec = ( with_flatten( asarray(Model.ops, dtype='uint64') - >> uniqued(embed, column=5) + >> embed >> LN(Maxout(width, width*4, pieces=3)) - >> Residual(ExtractWindow(nW=1) >> LN(Maxout(width, width*3))) - >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3)) - >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3)) - >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3)), + >> Residual(ExtractWindow(nW=1) >> ReLu(width, width*3)) + >> Residual(ExtractWindow(nW=1) >> ReLu(width, width*3)) + >> Residual(ExtractWindow(nW=1) >> ReLu(width, width*3)) + >> Residual(ExtractWindow(nW=1) >> ReLu(width, width*3)), pad=4) ) if preprocess not in (False, None): @@ -321,7 +321,8 @@ def zero_init(model): def doc2feats(cols=None): - cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH] + if cols is None: + cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH] def forward(docs, drop=0.): feats = [] for doc in docs: From f93f2bed58a5caa8bdfba78c3c3f035c97c790e7 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal 
Date: Wed, 9 Aug 2017 17:47:03 -0500 Subject: [PATCH 24/49] Revert use of layer normalization in Tok2Vec --- spacy/_ml.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/_ml.py b/spacy/_ml.py index d08a43b8e..39041cc22 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -221,7 +221,7 @@ def Tok2Vec(width, embed_size, preprocess=None): with_flatten( asarray(Model.ops, dtype='uint64') >> embed - >> LN(Maxout(width, width*4, pieces=3)) + >> Maxout(width, width*4, pieces=3) >> Residual(ExtractWindow(nW=1) >> ReLu(width, width*3)) >> Residual(ExtractWindow(nW=1) >> ReLu(width, width*3)) >> Residual(ExtractWindow(nW=1) >> ReLu(width, width*3)) From f37528ef58c89988eaa8c046d6dd0f0e6144a378 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 9 Aug 2017 17:52:53 -0500 Subject: [PATCH 25/49] Pass embed size for parser fine-tune. Use SELU --- spacy/syntax/nn_parser.pyx | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index f1f21134c..eb6117167 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -37,7 +37,8 @@ from preshed.maps cimport MapStruct from preshed.maps cimport map_get from thinc.api import layerize, chain, noop, clone -from thinc.neural import Model, Affine, ELU, ReLu, Maxout +from thinc.neural import Model, Affine, ReLu, Maxout +from thinc.neural._classes.selu import SELU from thinc.neural.ops import NumpyOps, CupyOps from thinc.neural.util import get_array_module @@ -238,8 +239,9 @@ cdef class Parser: token_vector_width = util.env_opt('token_vector_width', token_vector_width) hidden_width = util.env_opt('hidden_width', hidden_width) parser_maxout_pieces = util.env_opt('parser_maxout_pieces', 2) - tensors = fine_tune(Tok2Vec(token_vector_width, 7500, - preprocess=doc2feats(cols=[ID, NORM, PREFIX, SUFFIX, TAG]))) + embed_size = util.env_opt('embed_size', 7500) + tensors = fine_tune(Tok2Vec(token_vector_width, embed_size, + preprocess=doc2feats(cols=[ID, NORM, PREFIX, SUFFIX, SHAPE]))) if parser_maxout_pieces == 1: lower = PrecomputableAffine(hidden_width if depth >= 1 else nr_class, nF=cls.nr_feature, @@ -252,7 +254,7 @@ cdef class Parser: with Model.use_device('cpu'): upper = chain( - clone(Maxout(hidden_width), (depth-1)), + clone(SELU(hidden_width), (depth-1)), zero_init(Affine(nr_class, drop_factor=0.0)) ) # TODO: This is an unfortunate hack atm! From d01dc3704a5339cfd1f576a83f761ba9d6e62e7a Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 9 Aug 2017 20:06:33 -0500 Subject: [PATCH 26/49] Adjust parser model --- spacy/syntax/nn_parser.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index eb6117167..a94b94e83 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -241,7 +241,7 @@ cdef class Parser: parser_maxout_pieces = util.env_opt('parser_maxout_pieces', 2) embed_size = util.env_opt('embed_size', 7500) tensors = fine_tune(Tok2Vec(token_vector_width, embed_size, - preprocess=doc2feats(cols=[ID, NORM, PREFIX, SUFFIX, SHAPE]))) + preprocess=doc2feats())) if parser_maxout_pieces == 1: lower = PrecomputableAffine(hidden_width if depth >= 1 else nr_class, nF=cls.nr_feature, @@ -254,7 +254,7 @@ cdef class Parser: with Model.use_device('cpu'): upper = chain( - clone(SELU(hidden_width), (depth-1)), + clone(Maxout(hidden_width), (depth-1)), zero_init(Affine(nr_class, drop_factor=0.0)) ) # TODO: This is an unfortunate hack atm! 
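Note: the last few patches swap activations (ReLu, SELU, Maxout, layer norm) inside the same residual window-convolution block of Tok2Vec without changing its overall shape. A rough plain-numpy sketch of what one Residual(ExtractWindow(nW=1) >> ReLu(width, width*3)) step computes, under assumed shapes and with illustrative names (not the actual thinc implementation):

    import numpy as np

    def extract_window(X, nW=1):
        # X: (n_tokens, width). Concatenate each token's vector with the
        # vectors of its nW neighbours on either side, zero-padded at the
        # edges, giving shape (n_tokens, (2*nW + 1) * width).
        n, d = X.shape
        padded = np.vstack([np.zeros((nW, d)), X, np.zeros((nW, d))])
        return np.hstack([padded[i:i + n] for i in range(2 * nW + 1)])

    def residual_relu_block(X, W, b):
        # One residual step over a window of three token vectors: project the
        # concatenated window back down to `width`, apply ReLu, add the input.
        return X + np.maximum(0.0, extract_window(X, nW=1) @ W + b)

    # Shapes: X is (n_tokens, width), W is (3 * width, width), b is (width,).
    X = np.random.randn(5, 8)
    W = np.random.randn(24, 8) * 0.1
    b = np.zeros(8)
    assert residual_relu_block(X, W, b).shape == X.shape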
From d42a03b8ded67eabac13122ede58aa5f9e2dd447 Mon Sep 17 00:00:00 2001 From: Nikolai Kruglikov Date: Thu, 10 Aug 2017 14:38:30 +0500 Subject: [PATCH 27/49] Fix small typo in documentation --- website/docs/usage/adding-languages.jade | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/usage/adding-languages.jade b/website/docs/usage/adding-languages.jade index a0b77ad17..4cd65a62d 100644 --- a/website/docs/usage/adding-languages.jade +++ b/website/docs/usage/adding-languages.jade @@ -205,7 +205,7 @@ p +infobox("Why lazy-loading?") | Some languages contain large volumes of custom data, like lemmatizer - | loopup tables, or complex regular expression that are expensive to + | lookup tables, or complex regular expression that are expensive to | compute. As of spaCy v2.0, #[code Language] classes are not imported on | initialisation and are only loaded when you import them directly, or load | a model that requires a language to be loaded. To lazy-load languages in From 1a59db1c86537c54b8b59e3a2988c6a24749b7f2 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 12 Aug 2017 05:44:39 -0500 Subject: [PATCH 28/49] Fix dropout and learn rate in parser --- spacy/syntax/nn_parser.pyx | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index a94b94e83..201b988b9 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -39,6 +39,7 @@ from preshed.maps cimport map_get from thinc.api import layerize, chain, noop, clone from thinc.neural import Model, Affine, ReLu, Maxout from thinc.neural._classes.selu import SELU +from thinc.neural._classes.layernorm import LayerNorm from thinc.neural.ops import NumpyOps, CupyOps from thinc.neural.util import get_array_module @@ -467,7 +468,7 @@ cdef class Parser: docs = [docs] golds = [golds] if USE_FINE_TUNE: - my_tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=0.) + my_tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop) my_tokvecs = self.model[0].ops.flatten(my_tokvecs) tokvecs += my_tokvecs @@ -496,13 +497,13 @@ cdef class Parser: scores, bp_scores = vec2scores.begin_update(vector, drop=drop) d_scores = self.get_batch_loss(states, golds, scores) - d_vector = bp_scores(d_scores / d_scores.shape[0], sgd=sgd) + d_vector = bp_scores(d_scores, sgd=sgd) if drop != 0: d_vector *= mask if isinstance(self.model[0].ops, CupyOps) \ and not isinstance(token_ids, state2vec.ops.xp.ndarray): - # Move token_ids and d_vector to CPU, asynchronously + # Move token_ids and d_vector to GPU, asynchronously backprops.append(( get_async(cuda_stream, token_ids), get_async(cuda_stream, d_vector), From ebe0f7f6418927e92086c1d408c6c9622682efcb Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 12 Aug 2017 05:45:20 -0500 Subject: [PATCH 29/49] Pass embed size correctly in tagger, and cache embeddings for efficiency --- spacy/_ml.py | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/spacy/_ml.py b/spacy/_ml.py index 39041cc22..33c6f378b 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -23,8 +23,10 @@ from thinc.neural._classes.attention import ParametricAttention from thinc.linear.linear import LinearModel from thinc.api import uniqued, wrap, flatten_add_lengths + from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE, TAG, DEP from .tokens.doc import Doc +from . 
import util import numpy import io @@ -208,6 +210,17 @@ class PrecomputableMaxouts(Model): return Yfp, backward +def drop_layer(layer, factor=1.0): + def drop_layer_fwd(X, drop=0.): + drop *= factor + mask = layer.ops.get_dropout_mask((1,), drop) + if mask is not None and mask[0] == 0.: + return X, lambda dX, sgd=None: dX + else: + return layer.begin_update(X, drop=drop) + return wrap(drop_layer_fwd, layer) + + def Tok2Vec(width, embed_size, preprocess=None): cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH] with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add}): @@ -220,13 +233,13 @@ def Tok2Vec(width, embed_size, preprocess=None): tok2vec = ( with_flatten( asarray(Model.ops, dtype='uint64') - >> embed - >> Maxout(width, width*4, pieces=3) - >> Residual(ExtractWindow(nW=1) >> ReLu(width, width*3)) - >> Residual(ExtractWindow(nW=1) >> ReLu(width, width*3)) - >> Residual(ExtractWindow(nW=1) >> ReLu(width, width*3)) - >> Residual(ExtractWindow(nW=1) >> ReLu(width, width*3)), - pad=4) + >> uniqued(embed >> Maxout(width, width*4, pieces=3), column=5) + >> Residual( + (ExtractWindow(nW=1) >> ReLu(width, width*3)) + >> (ExtractWindow(nW=1) >> ReLu(width, width*3)) + >> (ExtractWindow(nW=1) >> ReLu(width, width*3)) + >> (ExtractWindow(nW=1) >> ReLu(width, width*3)) + ), pad=4) ) if preprocess not in (False, None): tok2vec = preprocess >> tok2vec @@ -430,9 +443,10 @@ def getitem(i): return layerize(getitem_fwd) def build_tagger_model(nr_class, token_vector_width, **cfg): + embed_size = util.env_opt('embed_size', 7500) with Model.define_operators({'>>': chain, '+': add}): # Input: (doc, tensor) tuples - private_tok2vec = Tok2Vec(token_vector_width, 7500, preprocess=doc2feats()) + private_tok2vec = Tok2Vec(token_vector_width, embed_size, preprocess=doc2feats()) model = ( fine_tune(private_tok2vec) From 680043ebca7b695933d4935e6d189c54e27fa087 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 12 Aug 2017 08:54:21 -0500 Subject: [PATCH 30/49] Improve efficiency of tagger.set_annotations for GPU --- spacy/pipeline.pyx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx index b87f73c27..f367d2b5b 100644 --- a/spacy/pipeline.pyx +++ b/spacy/pipeline.pyx @@ -284,6 +284,8 @@ class NeuralTagger(BaseThincComponent): cdef Vocab vocab = self.vocab for i, doc in enumerate(docs): doc_tag_ids = batch_tag_ids[i] + if hasattr(doc_tag_ids, 'get'): + doc_tag_ids = doc_tag_ids.get() for j, tag_id in enumerate(doc_tag_ids): # Don't clobber preset POS tags if doc.c[j].tag == 0 and doc.c[j].pos == 0: From 8870d491f1f4c1b50791484d234c2890f225abef Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 12 Aug 2017 08:55:53 -0500 Subject: [PATCH 31/49] Remove redundant pickling during training --- spacy/cli/train.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 9ed621c12..04aac8319 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -99,8 +99,6 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0, util.set_env_log(False) epoch_model_path = output_path / ('model%d' % i) nlp.to_disk(epoch_model_path) - with (output_path / ('model%d.pickle' % i)).open('wb') as file_: - dill.dump(nlp, file_, -1) nlp_loaded = lang_class(pipeline=pipeline) nlp_loaded = nlp_loaded.from_disk(epoch_model_path) scorer = nlp_loaded.evaluate( From cd5ecedf6a02c0ce1fe2c2157e2281751cec98cb Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 12 Aug 2017 08:56:33 -0500 Subject: [PATCH 
32/49] Try drop_layer in parser --- spacy/syntax/nn_parser.pyx | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 201b988b9..bd56ba40b 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -47,6 +47,7 @@ from .. import util from ..util import get_async, get_cuda_stream from .._ml import zero_init, PrecomputableAffine, PrecomputableMaxouts from .._ml import Tok2Vec, doc2feats, rebatch, fine_tune +from .._ml import Residual, drop_layer from ..compat import json_dumps from . import _parse_features @@ -255,7 +256,7 @@ cdef class Parser: with Model.use_device('cpu'): upper = chain( - clone(Maxout(hidden_width), (depth-1)), + clone(drop_layer(Residual(Maxout(hidden_width))), (depth-1)), zero_init(Affine(nr_class, drop_factor=0.0)) ) # TODO: This is an unfortunate hack atm! From 4ab0c8c8e9b3320675e6a5e20d39db0be7fa0210 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 12 Aug 2017 08:56:57 -0500 Subject: [PATCH 33/49] Try different drop_layer structure in Tok2Vec --- spacy/_ml.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/spacy/_ml.py b/spacy/_ml.py index 33c6f378b..e37bcac52 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -210,14 +210,14 @@ class PrecomputableMaxouts(Model): return Yfp, backward -def drop_layer(layer, factor=1.0): +def drop_layer(layer, factor=2.): def drop_layer_fwd(X, drop=0.): drop *= factor mask = layer.ops.get_dropout_mask((1,), drop) - if mask is not None and mask[0] == 0.: - return X, lambda dX, sgd=None: dX - else: + if mask is None or mask > 0: return layer.begin_update(X, drop=drop) + else: + return X, lambda dX, sgd=None: dX return wrap(drop_layer_fwd, layer) @@ -229,17 +229,17 @@ def Tok2Vec(width, embed_size, preprocess=None): suffix = get_col(cols.index(SUFFIX)) >> HashEmbed(width, embed_size//2, name='embed_suffix') shape = get_col(cols.index(SHAPE)) >> HashEmbed(width, embed_size//2, name='embed_shape') - embed = (norm | prefix | suffix | shape ) + embed = (norm | prefix | suffix | shape ) >> Maxout(width, width*4, pieces=3) tok2vec = ( with_flatten( asarray(Model.ops, dtype='uint64') - >> uniqued(embed >> Maxout(width, width*4, pieces=3), column=5) - >> Residual( - (ExtractWindow(nW=1) >> ReLu(width, width*3)) - >> (ExtractWindow(nW=1) >> ReLu(width, width*3)) - >> (ExtractWindow(nW=1) >> ReLu(width, width*3)) - >> (ExtractWindow(nW=1) >> ReLu(width, width*3)) - ), pad=4) + >> uniqued(embed, column=5) + >> drop_layer( + Residual( + (ExtractWindow(nW=1) >> ReLu(width, width*3)) + ) + ) ** 4, pad=4 + ) ) if preprocess not in (False, None): tok2vec = preprocess >> tok2vec From d4f2baf7dd7f0136916aa54c5d2af3ce12a43495 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 12 Aug 2017 21:44:15 +0200 Subject: [PATCH 34/49] Add create_meta option to package command Re-create meta.json in model directory, even if it exists. Especially useful when updating existing spaCy models or training with Prodigy. Ensures user won't end up with multiple "en_core_web_sm" models, and offers easy way to change the model's name and settings without having to edit the meta.json file. --- spacy/cli/package.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/spacy/cli/package.py b/spacy/cli/package.py index 1c720c2b5..9be28d4aa 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -15,10 +15,11 @@ from .. 
import about @plac.annotations( input_dir=("directory with model data", "positional", None, str), output_dir=("output parent directory", "positional", None, str), - meta=("path to meta.json", "option", "m", str), + meta_path=("path to meta.json", "option", "m", str), + create_meta=("create meta.json, even if one exists in directory", "flag", "c", bool), force=("force overwriting of existing folder in output directory", "flag", "f", bool) ) -def package(cmd, input_dir, output_dir, meta=None, force=False): +def package(cmd, input_dir, output_dir, meta_path=None, create_meta=False, force=False): """ Generate Python package for model data, including meta and required installation files. A new directory will be created in the specified @@ -26,7 +27,7 @@ def package(cmd, input_dir, output_dir, meta=None, force=False): """ input_path = util.ensure_path(input_dir) output_path = util.ensure_path(output_dir) - meta_path = util.ensure_path(meta) + meta_path = util.ensure_path(meta_path) if not input_path or not input_path.exists(): prints(input_path, title="Model directory not found", exits=1) if not output_path or not output_path.exists(): @@ -38,7 +39,7 @@ def package(cmd, input_dir, output_dir, meta=None, force=False): template_manifest = get_template('MANIFEST.in') template_init = get_template('xx_model_name/__init__.py') meta_path = meta_path or input_path / 'meta.json' - if meta_path.is_file(): + if not create_meta and meta_path.is_file(): prints(meta_path, title="Reading meta.json from file") meta = util.read_json(meta_path) else: From b353e4d843be9eb55bc89927df1e4d4ec099dc21 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 12 Aug 2017 14:47:45 -0500 Subject: [PATCH 35/49] Work on parser beam training --- setup.py | 1 + spacy/syntax/_beam_utils.pyx | 196 +++++++++++++++++++++++++++++ spacy/syntax/nn_parser.pyx | 27 +++- spacy/tests/parser/test_nn_beam.py | 98 +++++++++++++++ 4 files changed, 321 insertions(+), 1 deletion(-) create mode 100644 spacy/syntax/_beam_utils.pyx create mode 100644 spacy/tests/parser/test_nn_beam.py diff --git a/setup.py b/setup.py index 0a3384ed5..02d4fe0d9 100755 --- a/setup.py +++ b/setup.py @@ -36,6 +36,7 @@ MOD_NAMES = [ 'spacy.syntax.transition_system', 'spacy.syntax.arc_eager', 'spacy.syntax._parse_features', + 'spacy.syntax._beam_utils', 'spacy.gold', 'spacy.tokens.doc', 'spacy.tokens.span', diff --git a/spacy/syntax/_beam_utils.pyx b/spacy/syntax/_beam_utils.pyx new file mode 100644 index 000000000..4a4b79dad --- /dev/null +++ b/spacy/syntax/_beam_utils.pyx @@ -0,0 +1,196 @@ +# cython: infer_types=True +cimport numpy as np +import numpy +from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF +from thinc.extra.search cimport Beam +from thinc.extra.search import MaxViolation +from thinc.typedefs cimport hash_t, class_t + +from .transition_system cimport TransitionSystem, Transition +from .stateclass cimport StateClass +from ..gold cimport GoldParse +from ..tokens.doc cimport Doc + + +# These are passed as callbacks to thinc.search.Beam +cdef int _transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1: + dest = _dest + src = _src + moves = _moves + dest.clone(src) + moves[clas].do(dest.c, moves[clas].label) + + +cdef int _check_final_state(void* _state, void* extra_args) except -1: + return (_state).is_final() + + +def _cleanup(Beam beam): + for i in range(beam.width): + Py_XDECREF(beam._states[i].content) + Py_XDECREF(beam._parents[i].content) + + +cdef hash_t _hash_state(void* _state, void* _) except 0: + state = _state + if 
state.c.is_final(): + return 1 + else: + return state.c.hash() + + +cdef class ParserBeam(object): + cdef public TransitionSystem moves + cdef public object docs + cdef public object golds + cdef public object beams + + def __init__(self, TransitionSystem moves, docs, golds, + int width=4, float density=0.001): + self.moves = moves + self.docs = docs + self.golds = golds + self.beams = [] + cdef Doc doc + cdef Beam beam + for doc in docs: + beam = Beam(self.moves.n_moves, width, density) + beam.initialize(self.moves.init_beam_state, doc.length, doc.c) + self.beams.append(beam) + + @property + def is_done(self): + return all(beam.is_done for beam in self.beams) + + def __getitem__(self, i): + return self.beams[i] + + def __len__(self): + return len(self.beams) + + def advance(self, scores, follow_gold=False): + cdef Beam beam + for i, beam in enumerate(self.beams): + self._set_scores(beam, scores[i]) + if self.golds is not None: + self._set_costs(beam, self.golds[i], follow_gold=follow_gold) + if follow_gold: + assert self.golds is not None + beam.advance(_transition_state, NULL, self.moves.c) + else: + beam.advance(_transition_state, _hash_state, self.moves.c) + beam.check_done(_check_final_state, NULL) + + def _set_scores(self, Beam beam, scores): + for i in range(beam.size): + state = beam.at(i) + for j in range(beam.nr_class): + beam.scores[i][j] = scores[i, j] + self.moves.set_valid(beam.is_valid[i], state.c) + + def _set_costs(self, Beam beam, GoldParse gold, int follow_gold=False): + for i in range(beam.size): + state = beam.at(i) + self.moves.set_costs(beam.is_valid[i], beam.costs[i], state, gold) + if follow_gold: + for j in range(beam.nr_class): + beam.is_valid[i][j] *= beam.costs[i][j] <= 0 + + +def get_token_ids(states, int n_tokens): + cdef StateClass state + cdef np.ndarray ids = numpy.zeros((len(states), n_tokens), + dtype='i', order='C') + c_ids = ids.data + for i, state in enumerate(states): + if not state.is_final(): + state.c.set_context_tokens(c_ids, n_tokens) + c_ids += ids.shape[1] + return ids + + +def update_beam(TransitionSystem moves, int nr_feature, + docs, tokvecs, golds, + state2vec, vec2scores, drop=0., sgd=None, + losses=None, int width=4, float density=0.001): + pbeam = ParserBeam(moves, docs, golds, + width=width, density=density) + gbeam = ParserBeam(moves, docs, golds, + width=width, density=density) + beam_map = {} + backprops = [] + violns = [MaxViolation() for _ in range(len(docs))] + example_ids = list(range(len(docs))) + while not pbeam.is_done and not gbeam.is_done: + states, p_indices, g_indices = get_states(example_ids, pbeam, gbeam, beam_map) + + token_ids = get_token_ids(states, nr_feature) + vectors, bp_vectors = state2vec.begin_update(token_ids, drop=drop) + scores, bp_scores = vec2scores.begin_update(vectors, drop=drop) + + backprops.append((token_ids, bp_vectors, bp_scores)) + + p_scores = [scores[indices] for indices in p_indices] + g_scores = [scores[indices] for indices in g_indices] + pbeam.advance(p_scores) + gbeam.advance(g_scores, follow_gold=True) + + for i, violn in enumerate(violns): + violn.check_crf(pbeam[i], gbeam[i]) + + histories = [(v.p_hist + v.g_hist) for v in violns] + losses = [(v.p_probs + v.g_probs) for v in violns] + states_d_scores = get_gradient(moves.n_moves, beam_map, + histories, losses) + return states_d_scores, backprops + + +def get_states(example_ids, pbeams, gbeams, beam_map): + states = [] + seen = {} + p_indices = [] + g_indices = [] + cdef Beam pbeam, gbeam + for eg_id, pbeam, gbeam in zip(example_ids, 
pbeams, gbeams): + p_indices.append([]) + for j in range(pbeam.size): + key = tuple([eg_id] + pbeam.histories[j]) + seen[key] = len(states) + p_indices[-1].append(len(states)) + states.append(pbeam.at(j)) + beam_map.update(seen) + g_indices.append([]) + for i in range(gbeam.size): + key = tuple([eg_id] + gbeam.histories[i]) + if key in seen: + g_indices[-1].append(seen[key]) + else: + g_indices[-1].append(len(states)) + beam_map[key] = len(states) + states.append(gbeam.at(i)) + + p_indices = numpy.asarray(p_indices, dtype='i') + g_indices = numpy.asarray(g_indices, dtype='i') + return states, p_indices, g_indices + + +def get_gradient(nr_class, beam_map, histories, losses): + """ + The global model assigns a loss to each parse. The beam scores + are additive, so the same gradient is applied to each action + in the history. This gives the gradient of a single *action* + for a beam state -- so we have "the gradient of loss for taking + action i given history H." + """ + nr_step = max(len(hist) for hist in histories) + nr_beam = len(histories) + grads = [numpy.zeros((nr_beam, nr_class), dtype='f') for _ in range(nr_step)] + for hist, loss in zip(histories, losses): + key = tuple() + for j, clas in enumerate(hist): + grads[j][i, clas] = loss + key = key + clas + i = beam_map[key] + return grads + + diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index bd56ba40b..11584e4d2 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -63,6 +63,7 @@ from ..tokens.doc cimport Doc from ..strings cimport StringStore from ..gold cimport GoldParse from ..attrs cimport ID, TAG, DEP, ORTH, NORM, PREFIX, SUFFIX, TAG +from . import _beam_utils USE_FINE_TUNE = True @@ -256,7 +257,7 @@ cdef class Parser: with Model.use_device('cpu'): upper = chain( - clone(drop_layer(Residual(Maxout(hidden_width))), (depth-1)), + clone(Residual(ReLu(hidden_width)), (depth-1)), zero_init(Affine(nr_class, drop_factor=0.0)) ) # TODO: This is an unfortunate hack atm! @@ -526,6 +527,30 @@ cdef class Parser: bp_my_tokvecs(d_tokvecs, sgd=sgd) return d_tokvecs + def update_beam(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None): + docs, tokvecs = docs_tokvecs + tokvecs = self.model[0].ops.flatten(tokvecs) + + cuda_stream = get_cuda_stream() + state2vec, vec2scores = self.get_batch_model(len(docs), tokvecs, cuda_stream, 0.0) + + states_d_scores, backprops = _beam_utils.update_beam(self.moves, self.nr_feature, + docs, tokvecs, golds, + state2vec, vec2scores, + drop, sgd, losses) + backprop_lower = [] + for i, d_scores in enumerate(states_d_scores): + ids, bp_vectors, bp_scores = backprops[i] + d_vector = bp_scores(d_scores, sgd=sgd) + backprop_lower.append(( + get_async(cuda_stream, ids), + get_async(cuda_stream, d_vector), + bp_vectors)) + d_tokvecs = self.model[0].ops.allocate(tokvecs.shape) + self._make_updates(d_tokvecs, backprop_lower, sgd, cuda_stream) + lengths = [len(doc) for doc in docs] + return self.model[0].ops.unflatten(d_tokvecs, lengths) + def _init_gold_batch(self, whole_docs, whole_golds): """Make a square batch, of length equal to the shortest doc. A long doc will get multiple states. 
Let's say we have a doc of length 2*N, diff --git a/spacy/tests/parser/test_nn_beam.py b/spacy/tests/parser/test_nn_beam.py new file mode 100644 index 000000000..ad0dfa7a1 --- /dev/null +++ b/spacy/tests/parser/test_nn_beam.py @@ -0,0 +1,98 @@ +from __future__ import unicode_literals +import pytest +import numpy +from thinc.api import layerize + +from ...vocab import Vocab +from ...syntax.arc_eager import ArcEager +from ...tokens import Doc +from ...gold import GoldParse +from ...syntax._beam_utils import ParserBeam, update_beam + + +@pytest.fixture +def vocab(): + return Vocab() + +@pytest.fixture +def moves(vocab): + aeager = ArcEager(vocab.strings, {}) + aeager.add_action(2, 'nsubj') + aeager.add_action(3, 'dobj') + aeager.add_action(2, 'aux') + return aeager + + +@pytest.fixture +def docs(vocab): + return [Doc(vocab, words=['Rats', 'bite', 'things'])] + +@pytest.fixture +def tokvecs(docs, vector_size): + output = [] + for doc in docs: + vec = numpy.random.uniform(-0.1, 0.1, (len(doc), vector_size)) + output.append(numpy.asarray(vec)) + return output + + +@pytest.fixture +def golds(docs): + return [GoldParse(doc) for doc in docs] + + +@pytest.fixture +def batch_size(docs): + return len(docs) + + +@pytest.fixture +def beam_width(): + return 4 + + +@pytest.fixture +def vector_size(): + return 6 + + +@pytest.fixture +def beam(moves, docs, golds, beam_width): + return ParserBeam(moves, docs, golds, width=beam_width) + +@pytest.fixture +def scores(moves, batch_size, beam_width): + return [ + numpy.asarray( + numpy.random.uniform(-0.1, 0.1, (batch_size, moves.n_moves)), + dtype='f') + for _ in range(batch_size)] + + +def test_create_beam(beam): + pass + + +def test_beam_advance(beam, scores): + beam.advance(scores) + + +def test_beam_advance_too_few_scores(beam, scores): + with pytest.raises(IndexError): + beam.advance(scores[:-1]) + + +def test_update_beam(moves, docs, tokvecs, golds, vector_size): + @layerize + def state2vec(X, drop=0.): + vec = numpy.ones((X.shape[0], vector_size), dtype='f') + return vec, None + @layerize + def vec2scores(X, drop=0.): + scores = numpy.ones((X.shape[0], moves.n_moves), dtype='f') + return scores, None + d_loss, backprops = update_beam(moves, 13, docs, tokvecs, golds, + state2vec, vec2scores, drop=0.0, sgd=None, + losses={}, width=4, density=0.001) + + From d4308d236356e483c60f0119549f5a4da12fe1cc Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 12 Aug 2017 17:14:39 -0500 Subject: [PATCH 36/49] Initialize State offset to 0 --- spacy/syntax/_state.pxd | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/syntax/_state.pxd b/spacy/syntax/_state.pxd index c06851978..9aeeba441 100644 --- a/spacy/syntax/_state.pxd +++ b/spacy/syntax/_state.pxd @@ -37,6 +37,7 @@ cdef cppclass StateC: this.shifted = calloc(length + (PADDING * 2), sizeof(bint)) this._sent = calloc(length + (PADDING * 2), sizeof(TokenC)) this._ents = calloc(length + (PADDING * 2), sizeof(Entity)) + this.offset = 0 cdef int i for i in range(length + (PADDING * 2)): this._ents[i].end = -1 From 4638f4b869ad18c86c227e71e99c462aabd31eba Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 12 Aug 2017 17:15:16 -0500 Subject: [PATCH 37/49] Fix beam update --- spacy/syntax/_beam_utils.pyx | 76 +++++++++++++++++++++--------------- spacy/syntax/nn_parser.pyx | 20 ++++++---- 2 files changed, 58 insertions(+), 38 deletions(-) diff --git a/spacy/syntax/_beam_utils.pyx b/spacy/syntax/_beam_utils.pyx index 4a4b79dad..10b5e407c 100644 --- a/spacy/syntax/_beam_utils.pyx +++ 
b/spacy/syntax/_beam_utils.pyx @@ -41,21 +41,24 @@ cdef hash_t _hash_state(void* _state, void* _) except 0: cdef class ParserBeam(object): cdef public TransitionSystem moves - cdef public object docs + cdef public object states cdef public object golds cdef public object beams - def __init__(self, TransitionSystem moves, docs, golds, + def __init__(self, TransitionSystem moves, states, golds, int width=4, float density=0.001): self.moves = moves - self.docs = docs + self.states = states self.golds = golds self.beams = [] - cdef Doc doc cdef Beam beam - for doc in docs: + cdef StateClass state, st + for state in states: beam = Beam(self.moves.n_moves, width, density) - beam.initialize(self.moves.init_beam_state, doc.length, doc.c) + beam.initialize(self.moves.init_beam_state, state.c.length, state.c._sent) + for i in range(beam.size): + st = beam.at(i) + st.c.offset = state.c.offset self.beams.append(beam) @property @@ -100,34 +103,38 @@ cdef class ParserBeam(object): def get_token_ids(states, int n_tokens): cdef StateClass state cdef np.ndarray ids = numpy.zeros((len(states), n_tokens), - dtype='i', order='C') + dtype='int32', order='C') c_ids = ids.data for i, state in enumerate(states): if not state.is_final(): state.c.set_context_tokens(c_ids, n_tokens) + else: + ids[i] = -1 c_ids += ids.shape[1] return ids -def update_beam(TransitionSystem moves, int nr_feature, - docs, tokvecs, golds, +def update_beam(TransitionSystem moves, int nr_feature, int max_steps, + states, tokvecs, golds, state2vec, vec2scores, drop=0., sgd=None, losses=None, int width=4, float density=0.001): - pbeam = ParserBeam(moves, docs, golds, + pbeam = ParserBeam(moves, states, golds, width=width, density=density) - gbeam = ParserBeam(moves, docs, golds, + gbeam = ParserBeam(moves, states, golds, width=width, density=density) - beam_map = {} + beam_maps = [] backprops = [] - violns = [MaxViolation() for _ in range(len(docs))] - example_ids = list(range(len(docs))) - while not pbeam.is_done and not gbeam.is_done: - states, p_indices, g_indices = get_states(example_ids, pbeam, gbeam, beam_map) + violns = [MaxViolation() for _ in range(len(states))] + for t in range(max_steps): + if pbeam.is_done and gbeam.is_done: + break + beam_maps.append({}) + states, p_indices, g_indices = get_states(pbeam, gbeam, beam_maps[-1]) token_ids = get_token_ids(states, nr_feature) vectors, bp_vectors = state2vec.begin_update(token_ids, drop=drop) scores, bp_scores = vec2scores.begin_update(vectors, drop=drop) - + backprops.append((token_ids, bp_vectors, bp_scores)) p_scores = [scores[indices] for indices in p_indices] @@ -140,18 +147,18 @@ def update_beam(TransitionSystem moves, int nr_feature, histories = [(v.p_hist + v.g_hist) for v in violns] losses = [(v.p_probs + v.g_probs) for v in violns] - states_d_scores = get_gradient(moves.n_moves, beam_map, + states_d_scores = get_gradient(moves.n_moves, beam_maps, histories, losses) return states_d_scores, backprops -def get_states(example_ids, pbeams, gbeams, beam_map): - states = [] +def get_states(pbeams, gbeams, beam_map): seen = {} + states = [] p_indices = [] g_indices = [] cdef Beam pbeam, gbeam - for eg_id, pbeam, gbeam in zip(example_ids, pbeams, gbeams): + for eg_id, (pbeam, gbeam) in enumerate(zip(pbeams, gbeams)): p_indices.append([]) for j in range(pbeam.size): key = tuple([eg_id] + pbeam.histories[j]) @@ -174,23 +181,30 @@ def get_states(example_ids, pbeams, gbeams, beam_map): return states, p_indices, g_indices -def get_gradient(nr_class, beam_map, histories, losses): +def 
get_gradient(nr_class, beam_maps, histories, losses): """ The global model assigns a loss to each parse. The beam scores are additive, so the same gradient is applied to each action in the history. This gives the gradient of a single *action* for a beam state -- so we have "the gradient of loss for taking action i given history H." + + Histories: Each hitory is a list of actions + Each candidate has a history + Each beam has multiple candidates + Each batch has multiple beams + So history is list of lists of lists of ints """ - nr_step = max(len(hist) for hist in histories) - nr_beam = len(histories) - grads = [numpy.zeros((nr_beam, nr_class), dtype='f') for _ in range(nr_step)] - for hist, loss in zip(histories, losses): - key = tuple() - for j, clas in enumerate(hist): - grads[j][i, clas] = loss - key = key + clas - i = beam_map[key] + nr_step = len(beam_maps) + grads = [numpy.zeros((max(beam_map.values())+1, nr_class), dtype='f') + for beam_map in beam_maps] + for eg_id, hists in enumerate(histories): + for loss, hist in zip(losses[eg_id], hists): + key = tuple([eg_id]) + for j, clas in enumerate(hist): + i = beam_maps[j][key] + grads[j][i, clas] = loss + key = key + tuple([clas]) return grads diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 11584e4d2..c842ef00b 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -529,23 +529,29 @@ cdef class Parser: def update_beam(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None): docs, tokvecs = docs_tokvecs + lengths = [len(d) for d in docs] tokvecs = self.model[0].ops.flatten(tokvecs) + states, golds, max_moves = self._init_gold_batch(docs, golds) cuda_stream = get_cuda_stream() - state2vec, vec2scores = self.get_batch_model(len(docs), tokvecs, cuda_stream, 0.0) + state2vec, vec2scores = self.get_batch_model(len(states), tokvecs, cuda_stream, 0.0) - states_d_scores, backprops = _beam_utils.update_beam(self.moves, self.nr_feature, - docs, tokvecs, golds, + states_d_scores, backprops = _beam_utils.update_beam(self.moves, self.nr_feature, max_moves, + states, tokvecs, golds, state2vec, vec2scores, drop, sgd, losses) backprop_lower = [] for i, d_scores in enumerate(states_d_scores): ids, bp_vectors, bp_scores = backprops[i] d_vector = bp_scores(d_scores, sgd=sgd) - backprop_lower.append(( - get_async(cuda_stream, ids), - get_async(cuda_stream, d_vector), - bp_vectors)) + if isinstance(self.model[0].ops, CupyOps) \ + and not isinstance(ids, state2vec.ops.xp.ndarray): + backprop_lower.append(( + get_async(cuda_stream, ids), + get_async(cuda_stream, d_vector), + bp_vectors)) + else: + backprop_lower.append((ids, d_vector, bp_vectors)) d_tokvecs = self.model[0].ops.allocate(tokvecs.shape) self._make_updates(d_tokvecs, backprop_lower, sgd, cuda_stream) lengths = [len(doc) for doc in docs] From 24b45b45c6bbbb42443d4eb91ec39062a22039d0 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 12 Aug 2017 17:15:28 -0500 Subject: [PATCH 38/49] Add test for beam update --- spacy/tests/parser/test_neural_parser.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/spacy/tests/parser/test_neural_parser.py b/spacy/tests/parser/test_neural_parser.py index 42b55745f..30a6367c8 100644 --- a/spacy/tests/parser/test_neural_parser.py +++ b/spacy/tests/parser/test_neural_parser.py @@ -78,3 +78,16 @@ def test_predict_doc_beam(parser, tok2vec, model, doc): parser(doc, beam_width=32, beam_density=0.001) for word in doc: print(word.text, word.head, word.dep_) + + +def test_update_doc_beam(parser, 
tok2vec, model, doc, gold): + parser.model = model + tokvecs, bp_tokvecs = tok2vec.begin_update([doc]) + d_tokvecs = parser.update_beam(([doc], tokvecs), [gold]) + assert d_tokvecs[0].shape == tokvecs[0].shape + def optimize(weights, gradient, key=None): + weights -= 0.001 * gradient + bp_tokvecs(d_tokvecs, sgd=optimize) + assert d_tokvecs[0].sum() == 0. + + From c96d76983626ed1edcd4f513318469b7b7e6a191 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 12 Aug 2017 18:21:54 -0500 Subject: [PATCH 39/49] Fix beam parse. Not sure if working --- spacy/syntax/_beam_utils.pyx | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/spacy/syntax/_beam_utils.pyx b/spacy/syntax/_beam_utils.pyx index 10b5e407c..3fcd322e2 100644 --- a/spacy/syntax/_beam_utils.pyx +++ b/spacy/syntax/_beam_utils.pyx @@ -87,8 +87,9 @@ cdef class ParserBeam(object): def _set_scores(self, Beam beam, scores): for i in range(beam.size): state = beam.at(i) - for j in range(beam.nr_class): - beam.scores[i][j] = scores[i, j] + if not state.is_final(): + for j in range(beam.nr_class): + beam.scores[i][j] = scores[i, j] self.moves.set_valid(beam.is_valid[i], state.c) def _set_costs(self, Beam beam, GoldParse gold, int follow_gold=False): @@ -137,8 +138,8 @@ def update_beam(TransitionSystem moves, int nr_feature, int max_steps, backprops.append((token_ids, bp_vectors, bp_scores)) - p_scores = [scores[indices] for indices in p_indices] - g_scores = [scores[indices] for indices in g_indices] + p_scores = [numpy.ascontiguousarray(scores[indices], dtype='f') for indices in p_indices] + g_scores = [numpy.ascontiguousarray(scores[indices], dtype='f') for indices in g_indices] pbeam.advance(p_scores) gbeam.advance(g_scores, follow_gold=True) @@ -176,8 +177,8 @@ def get_states(pbeams, gbeams, beam_map): beam_map[key] = len(states) states.append(gbeam.at(i)) - p_indices = numpy.asarray(p_indices, dtype='i') - g_indices = numpy.asarray(g_indices, dtype='i') + p_indices = [numpy.asarray(idx, dtype='i') for idx in p_indices] + g_indices = [numpy.asarray(idx, dtype='i') for idx in g_indices] return states, p_indices, g_indices @@ -203,7 +204,9 @@ def get_gradient(nr_class, beam_maps, histories, losses): key = tuple([eg_id]) for j, clas in enumerate(hist): i = beam_maps[j][key] - grads[j][i, clas] = loss + # In step j, at state i action clas + # resulted in loss + grads[j][i, clas] += loss key = key + tuple([clas]) return grads From 28e930aae096407cf5dcb0cfda54bcbd881a551c Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 12 Aug 2017 19:22:52 -0500 Subject: [PATCH 40/49] Fixes for beam parsing. 
Not working --- spacy/syntax/_beam_utils.pyx | 41 ++++++++++++++++++++++-------------- spacy/syntax/nn_parser.pyx | 27 +++++++++++++++++++----- 2 files changed, 47 insertions(+), 21 deletions(-) diff --git a/spacy/syntax/_beam_utils.pyx b/spacy/syntax/_beam_utils.pyx index 3fcd322e2..af4aff9fe 100644 --- a/spacy/syntax/_beam_utils.pyx +++ b/spacy/syntax/_beam_utils.pyx @@ -60,10 +60,16 @@ cdef class ParserBeam(object): st = beam.at(i) st.c.offset = state.c.offset self.beams.append(beam) + + def __dealloc__(self): + if self.beams is not None: + for beam in self.beams: + if beam is not None: + _cleanup(beam) @property def is_done(self): - return all(beam.is_done for beam in self.beams) + return all(b.is_done for b in self.beams) def __getitem__(self, i): return self.beams[i] @@ -77,28 +83,31 @@ cdef class ParserBeam(object): self._set_scores(beam, scores[i]) if self.golds is not None: self._set_costs(beam, self.golds[i], follow_gold=follow_gold) - if follow_gold: - assert self.golds is not None - beam.advance(_transition_state, NULL, self.moves.c) - else: - beam.advance(_transition_state, _hash_state, self.moves.c) - beam.check_done(_check_final_state, NULL) + if follow_gold: + assert self.golds is not None + beam.advance(_transition_state, NULL, self.moves.c) + else: + beam.advance(_transition_state, _hash_state, self.moves.c) + beam.check_done(_check_final_state, NULL) - def _set_scores(self, Beam beam, scores): + def _set_scores(self, Beam beam, float[:, ::1] scores): + cdef float* c_scores = &scores[0, 0] for i in range(beam.size): state = beam.at(i) if not state.is_final(): for j in range(beam.nr_class): - beam.scores[i][j] = scores[i, j] - self.moves.set_valid(beam.is_valid[i], state.c) + beam.scores[i][j] = c_scores[i * beam.nr_class + j] + self.moves.set_valid(beam.is_valid[i], state.c) def _set_costs(self, Beam beam, GoldParse gold, int follow_gold=False): for i in range(beam.size): state = beam.at(i) - self.moves.set_costs(beam.is_valid[i], beam.costs[i], state, gold) - if follow_gold: - for j in range(beam.nr_class): - beam.is_valid[i][j] *= beam.costs[i][j] <= 0 + if not state.c.is_final(): + self.moves.set_costs(beam.is_valid[i], beam.costs[i], state, gold) + if follow_gold: + for j in range(beam.nr_class): + if beam.costs[i][j] >= 1: + beam.is_valid[i][j] = 0 def get_token_ids(states, int n_tokens): @@ -122,7 +131,7 @@ def update_beam(TransitionSystem moves, int nr_feature, int max_steps, pbeam = ParserBeam(moves, states, golds, width=width, density=density) gbeam = ParserBeam(moves, states, golds, - width=width, density=density) + width=width, density=0.0) beam_maps = [] backprops = [] violns = [MaxViolation() for _ in range(len(states))] @@ -145,7 +154,7 @@ def update_beam(TransitionSystem moves, int nr_feature, int max_steps, for i, violn in enumerate(violns): violn.check_crf(pbeam[i], gbeam[i]) - + histories = [(v.p_hist + v.g_hist) for v in violns] losses = [(v.p_probs + v.g_probs) for v in violns] states_d_scores = get_gradient(moves.n_moves, beam_maps, diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index c842ef00b..fa954a879 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -66,6 +66,7 @@ from ..attrs cimport ID, TAG, DEP, ORTH, NORM, PREFIX, SUFFIX, TAG from . 
import _beam_utils USE_FINE_TUNE = True +BEAM_PARSE = True def get_templates(*args, **kwargs): return [] @@ -335,7 +336,7 @@ cdef class Parser: return output def pipe(self, docs, int batch_size=1000, int n_threads=2, - beam_width=1, beam_density=0.001): + beam_width=4, beam_density=0.001): """ Process a stream of documents. @@ -348,14 +349,18 @@ cdef class Parser: Yields (Doc): Documents, in order. """ cdef Doc doc + cdef Beam beam for docs in cytoolz.partition_all(batch_size, docs): docs = list(docs) tokvecs = [doc.tensor for doc in docs] if beam_width == 1: parse_states = self.parse_batch(docs, tokvecs) else: - parse_states = self.beam_parse(docs, tokvecs, - beam_width=beam_width, beam_density=beam_density) + beams = self.beam_parse(docs, tokvecs, + beam_width=beam_width, beam_density=beam_density) + parse_states = [] + for beam in beams: + parse_states.append(beam.at(0)) self.set_annotations(docs, parse_states) yield from docs @@ -462,6 +467,9 @@ cdef class Parser: return beams def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None): + if BEAM_PARSE: + return self.update_beam(docs_tokvecs, golds, drop=drop, sgd=sgd, + losses=losses) if losses is not None and self.name not in losses: losses[self.name] = 0. docs, tokvec_lists = docs_tokvecs @@ -528,9 +536,16 @@ cdef class Parser: return d_tokvecs def update_beam(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None): + if losses is not None and self.name not in losses: + losses[self.name] = 0. docs, tokvecs = docs_tokvecs lengths = [len(d) for d in docs] tokvecs = self.model[0].ops.flatten(tokvecs) + if USE_FINE_TUNE: + my_tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop) + my_tokvecs = self.model[0].ops.flatten(my_tokvecs) + tokvecs += my_tokvecs + states, golds, max_moves = self._init_gold_batch(docs, golds) cuda_stream = get_cuda_stream() @@ -554,8 +569,10 @@ cdef class Parser: backprop_lower.append((ids, d_vector, bp_vectors)) d_tokvecs = self.model[0].ops.allocate(tokvecs.shape) self._make_updates(d_tokvecs, backprop_lower, sgd, cuda_stream) - lengths = [len(doc) for doc in docs] - return self.model[0].ops.unflatten(d_tokvecs, lengths) + d_tokvecs = self.model[0].ops.unflatten(d_tokvecs, lengths) + if USE_FINE_TUNE: + bp_my_tokvecs(d_tokvecs, sgd=sgd) + return d_tokvecs def _init_gold_batch(self, whole_docs, whole_golds): """Make a square batch, of length equal to the shortest doc. A long From 3e30712b627ea5c5625f4eeeba125b38722bd67a Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 12 Aug 2017 19:24:17 -0500 Subject: [PATCH 41/49] Improve defaults --- spacy/pipeline.pyx | 2 +- spacy/syntax/nn_parser.pyx | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx index f367d2b5b..634d3e4b5 100644 --- a/spacy/pipeline.pyx +++ b/spacy/pipeline.pyx @@ -138,7 +138,7 @@ class TokenVectorEncoder(BaseThincComponent): name = 'tensorizer' @classmethod - def Model(cls, width=128, embed_size=7500, **cfg): + def Model(cls, width=128, embed_size=4000, **cfg): """Create a new statistical model for the class. width (int): Output size of the model. diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index fa954a879..8a33a9da1 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -238,12 +238,12 @@ cdef class Parser: Base class of the DependencyParser and EntityRecognizer. 
""" @classmethod - def Model(cls, nr_class, token_vector_width=128, hidden_width=128, depth=1, **cfg): + def Model(cls, nr_class, token_vector_width=128, hidden_width=300, depth=1, **cfg): depth = util.env_opt('parser_hidden_depth', depth) token_vector_width = util.env_opt('token_vector_width', token_vector_width) hidden_width = util.env_opt('hidden_width', hidden_width) parser_maxout_pieces = util.env_opt('parser_maxout_pieces', 2) - embed_size = util.env_opt('embed_size', 7500) + embed_size = util.env_opt('embed_size', 4000) tensors = fine_tune(Tok2Vec(token_vector_width, embed_size, preprocess=doc2feats())) if parser_maxout_pieces == 1: From 17874fe4918eeef757bae153342468e166ba9c96 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 12 Aug 2017 19:35:40 -0500 Subject: [PATCH 42/49] Disable beam parsing --- spacy/syntax/nn_parser.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 8a33a9da1..ea61af1df 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -66,7 +66,7 @@ from ..attrs cimport ID, TAG, DEP, ORTH, NORM, PREFIX, SUFFIX, TAG from . import _beam_utils USE_FINE_TUNE = True -BEAM_PARSE = True +BEAM_PARSE = False def get_templates(*args, **kwargs): return [] @@ -336,7 +336,7 @@ cdef class Parser: return output def pipe(self, docs, int batch_size=1000, int n_threads=2, - beam_width=4, beam_density=0.001): + beam_width=1, beam_density=0.001): """ Process a stream of documents. From 92ebab6073f29fd919306e9f5775e8f8842692f8 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 13 Aug 2017 08:56:02 +0200 Subject: [PATCH 43/49] Update beam-update tests --- spacy/tests/parser/test_nn_beam.py | 25 +++++++------------------ 1 file changed, 7 insertions(+), 18 deletions(-) diff --git a/spacy/tests/parser/test_nn_beam.py b/spacy/tests/parser/test_nn_beam.py index ad0dfa7a1..45c85d969 100644 --- a/spacy/tests/parser/test_nn_beam.py +++ b/spacy/tests/parser/test_nn_beam.py @@ -8,6 +8,7 @@ from ...syntax.arc_eager import ArcEager from ...tokens import Doc from ...gold import GoldParse from ...syntax._beam_utils import ParserBeam, update_beam +from ...syntax.stateclass import StateClass @pytest.fixture @@ -27,6 +28,10 @@ def moves(vocab): def docs(vocab): return [Doc(vocab, words=['Rats', 'bite', 'things'])] +@pytest.fixture +def states(docs): + return [StateClass(doc) for doc in docs] + @pytest.fixture def tokvecs(docs, vector_size): output = [] @@ -57,8 +62,8 @@ def vector_size(): @pytest.fixture -def beam(moves, docs, golds, beam_width): - return ParserBeam(moves, docs, golds, width=beam_width) +def beam(moves, states, golds, beam_width): + return ParserBeam(moves, states, golds, width=beam_width) @pytest.fixture def scores(moves, batch_size, beam_width): @@ -80,19 +85,3 @@ def test_beam_advance(beam, scores): def test_beam_advance_too_few_scores(beam, scores): with pytest.raises(IndexError): beam.advance(scores[:-1]) - - -def test_update_beam(moves, docs, tokvecs, golds, vector_size): - @layerize - def state2vec(X, drop=0.): - vec = numpy.ones((X.shape[0], vector_size), dtype='f') - return vec, None - @layerize - def vec2scores(X, drop=0.): - scores = numpy.ones((X.shape[0], moves.n_moves), dtype='f') - return scores, None - d_loss, backprops = update_beam(moves, 13, docs, tokvecs, golds, - state2vec, vec2scores, drop=0.0, sgd=None, - losses={}, width=4, density=0.001) - - From 4ae0d5e1e63903613ed24aa7fea0fe5593c30fe8 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 
13 Aug 2017 09:03:38 +0200 Subject: [PATCH 44/49] Set defaults for convert command --- spacy/cli/convert.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py index a0a76e5ec..fef6753e6 100644 --- a/spacy/cli/convert.py +++ b/spacy/cli/convert.py @@ -21,10 +21,10 @@ CONVERTERS = { @plac.annotations( input_file=("input file", "positional", None, str), output_dir=("output directory for converted file", "positional", None, str), - n_sents=("Number of sentences per doc", "option", "n", float), + n_sents=("Number of sentences per doc", "option", "n", int), morphology=("Enable appending morphology to tags", "flag", "m", bool) ) -def convert(cmd, input_file, output_dir, n_sents, morphology): +def convert(cmd, input_file, output_dir, n_sents=1, morphology=False): """ Convert files into JSON format for use with train command and other experiment management functions. From 12de2638137c1c8c9f86b687d6296138e0aaa0ea Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 13 Aug 2017 09:33:39 +0200 Subject: [PATCH 45/49] Bug fixes to beam parsing. Learns small sample --- spacy/syntax/_beam_utils.pyx | 81 +++++++++++++++++++++++++++--------- 1 file changed, 61 insertions(+), 20 deletions(-) diff --git a/spacy/syntax/_beam_utils.pyx b/spacy/syntax/_beam_utils.pyx index af4aff9fe..0a513531d 100644 --- a/spacy/syntax/_beam_utils.pyx +++ b/spacy/syntax/_beam_utils.pyx @@ -66,7 +66,7 @@ cdef class ParserBeam(object): for beam in self.beams: if beam is not None: _cleanup(beam) - + @property def is_done(self): return all(b.is_done for b in self.beams) @@ -80,6 +80,8 @@ cdef class ParserBeam(object): def advance(self, scores, follow_gold=False): cdef Beam beam for i, beam in enumerate(self.beams): + if beam.is_done: + continue self._set_scores(beam, scores[i]) if self.golds is not None: self._set_costs(beam, self.golds[i], follow_gold=follow_gold) @@ -108,7 +110,22 @@ cdef class ParserBeam(object): for j in range(beam.nr_class): if beam.costs[i][j] >= 1: beam.is_valid[i][j] = 0 - + + +def is_gold(StateClass state, GoldParse gold, strings): + predicted = set() + truth = set() + for i in range(gold.length): + if gold.cand_to_gold[i] is None: + continue + if state.safe_get(i).dep: + predicted.add((i, state.H(i), strings[state.safe_get(i).dep])) + else: + predicted.add((i, state.H(i), 'ROOT')) + id_, word, tag, head, dep, ner = gold.orig_annot[gold.cand_to_gold[i]] + truth.add((id_, head, dep)) + return truth == predicted + def get_token_ids(states, int n_tokens): cdef StateClass state @@ -123,11 +140,13 @@ def get_token_ids(states, int n_tokens): c_ids += ids.shape[1] return ids - +nr_update = 0 def update_beam(TransitionSystem moves, int nr_feature, int max_steps, states, tokvecs, golds, state2vec, vec2scores, drop=0., sgd=None, losses=None, int width=4, float density=0.001): + global nr_update + nr_update += 1 pbeam = ParserBeam(moves, states, golds, width=width, density=density) gbeam = ParserBeam(moves, states, golds, @@ -139,8 +158,9 @@ def update_beam(TransitionSystem moves, int nr_feature, int max_steps, if pbeam.is_done and gbeam.is_done: break beam_maps.append({}) - states, p_indices, g_indices = get_states(pbeam, gbeam, beam_maps[-1]) - + states, p_indices, g_indices = get_states(pbeam, gbeam, beam_maps[-1], nr_update) + if not states: + break token_ids = get_token_ids(states, nr_feature) vectors, bp_vectors = state2vec.begin_update(token_ids, drop=drop) scores, bp_scores = vec2scores.begin_update(vectors, drop=drop) @@ -154,6 +174,16 @@ 
def update_beam(TransitionSystem moves, int nr_feature, int max_steps, for i, violn in enumerate(violns): violn.check_crf(pbeam[i], gbeam[i]) + # The non-monotonic oracle makes it difficult to ensure final costs are + # correct. Therefore do final correction + cdef Beam pred + for i, (pred, gold_parse) in enumerate(zip(pbeam, golds)): + for j in range(pred.size): + if is_gold(pred.at(j), gold_parse, moves.strings): + pred._states[j].loss = 0.0 + elif pred._states[j].loss == 0.0: + pred._states[j].loss = 1.0 + violn.check_crf(pred, gbeam[i]) histories = [(v.p_hist + v.g_hist) for v in violns] losses = [(v.p_probs + v.g_probs) for v in violns] @@ -162,30 +192,35 @@ def update_beam(TransitionSystem moves, int nr_feature, int max_steps, return states_d_scores, backprops -def get_states(pbeams, gbeams, beam_map): +def get_states(pbeams, gbeams, beam_map, nr_update): seen = {} states = [] p_indices = [] g_indices = [] cdef Beam pbeam, gbeam for eg_id, (pbeam, gbeam) in enumerate(zip(pbeams, gbeams)): + if pbeam.loss > 0 and pbeam.min_score > (gbeam.score + nr_update): + continue p_indices.append([]) for j in range(pbeam.size): - key = tuple([eg_id] + pbeam.histories[j]) - seen[key] = len(states) - p_indices[-1].append(len(states)) - states.append(pbeam.at(j)) + state = pbeam.at(j) + if not state.is_final(): + key = tuple([eg_id] + pbeam.histories[j]) + seen[key] = len(states) + p_indices[-1].append(len(states)) + states.append(pbeam.at(j)) beam_map.update(seen) g_indices.append([]) for i in range(gbeam.size): - key = tuple([eg_id] + gbeam.histories[i]) - if key in seen: - g_indices[-1].append(seen[key]) - else: - g_indices[-1].append(len(states)) - beam_map[key] = len(states) - states.append(gbeam.at(i)) - + state = gbeam.at(j) + if not state.is_final(): + key = tuple([eg_id] + gbeam.histories[i]) + if key in seen: + g_indices[-1].append(seen[key]) + else: + g_indices[-1].append(len(states)) + beam_map[key] = len(states) + states.append(gbeam.at(i)) p_indices = [numpy.asarray(idx, dtype='i') for idx in p_indices] g_indices = [numpy.asarray(idx, dtype='i') for idx in g_indices] return states, p_indices, g_indices @@ -206,12 +241,18 @@ def get_gradient(nr_class, beam_maps, histories, losses): So history is list of lists of lists of ints """ nr_step = len(beam_maps) - grads = [numpy.zeros((max(beam_map.values())+1, nr_class), dtype='f') - for beam_map in beam_maps] + grads = [] + for beam_map in beam_maps: + if beam_map: + grads.append(numpy.zeros((max(beam_map.values())+1, nr_class), dtype='f')) + else: + grads.append(None) for eg_id, hists in enumerate(histories): for loss, hist in zip(losses[eg_id], hists): key = tuple([eg_id]) for j, clas in enumerate(hist): + if grads[j] is None: + continue i = beam_maps[j][key] # In step j, at state i action clas # resulted in loss From 4363b4aa4a757807831d89e1f1b10bc46e8bc69a Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 13 Aug 2017 12:36:55 +0200 Subject: [PATCH 46/49] Fix redundant tokvecs updates during update --- spacy/language.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index 6d97f41fe..cb679a2bc 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -305,14 +305,17 @@ class Language(object): grads[key] = (W, dW) pipes = list(self.pipeline[1:]) random.shuffle(pipes) + tokvecses, bp_tokvecses = tok2vec.model.begin_update(feats, drop=drop) + all_d_tokvecses = [tok2vec.model.ops.allocate(tv.shape) for tv in tokvecses] for proc in pipes: if not hasattr(proc, 'update'): 
continue - tokvecses, bp_tokvecses = tok2vec.model.begin_update(feats, drop=drop) d_tokvecses = proc.update((docs, tokvecses), golds, drop=drop, sgd=get_grads, losses=losses) if update_tensors and d_tokvecses is not None: - bp_tokvecses(d_tokvecses, sgd=sgd) + for i, d_tv in enumerate(d_tokvecses): + all_d_tokvecses[i] += d_tv + bp_tokvecses(all_d_tokvecses, sgd=sgd) for key, (W, dW) in grads.items(): sgd(W, dW, key=key) # Clear the tensor variable, to free GPU memory. From 6a42cc16ff673c738e29aa515c1623dde4cf9566 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 13 Aug 2017 12:37:26 +0200 Subject: [PATCH 47/49] Fix beam parser, improve efficiency of non-beam --- spacy/syntax/_beam_utils.pyx | 39 ++++++++++++------------------------ spacy/syntax/beam_parser.pyx | 14 +------------ spacy/syntax/nn_parser.pyx | 38 +++++++++++++++++++++++------------ 3 files changed, 39 insertions(+), 52 deletions(-) diff --git a/spacy/syntax/_beam_utils.pyx b/spacy/syntax/_beam_utils.pyx index 0a513531d..6df8d472f 100644 --- a/spacy/syntax/_beam_utils.pyx +++ b/spacy/syntax/_beam_utils.pyx @@ -1,4 +1,5 @@ # cython: infer_types=True +# cython: profile=True cimport numpy as np import numpy from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF @@ -155,8 +156,6 @@ def update_beam(TransitionSystem moves, int nr_feature, int max_steps, backprops = [] violns = [MaxViolation() for _ in range(len(states))] for t in range(max_steps): - if pbeam.is_done and gbeam.is_done: - break beam_maps.append({}) states, p_indices, g_indices = get_states(pbeam, gbeam, beam_maps[-1], nr_update) if not states: @@ -174,16 +173,6 @@ def update_beam(TransitionSystem moves, int nr_feature, int max_steps, for i, violn in enumerate(violns): violn.check_crf(pbeam[i], gbeam[i]) - # The non-monotonic oracle makes it difficult to ensure final costs are - # correct. 
Therefore do final correction - cdef Beam pred - for i, (pred, gold_parse) in enumerate(zip(pbeam, golds)): - for j in range(pred.size): - if is_gold(pred.at(j), gold_parse, moves.strings): - pred._states[j].loss = 0.0 - elif pred._states[j].loss == 0.0: - pred._states[j].loss = 1.0 - violn.check_crf(pred, gbeam[i]) histories = [(v.p_hist + v.g_hist) for v in violns] losses = [(v.p_probs + v.g_probs) for v in violns] @@ -199,20 +188,18 @@ def get_states(pbeams, gbeams, beam_map, nr_update): g_indices = [] cdef Beam pbeam, gbeam for eg_id, (pbeam, gbeam) in enumerate(zip(pbeams, gbeams)): - if pbeam.loss > 0 and pbeam.min_score > (gbeam.score + nr_update): - continue p_indices.append([]) - for j in range(pbeam.size): - state = pbeam.at(j) + for i in range(pbeam.size): + state = pbeam.at(i) if not state.is_final(): - key = tuple([eg_id] + pbeam.histories[j]) + key = tuple([eg_id] + pbeam.histories[i]) seen[key] = len(states) p_indices[-1].append(len(states)) - states.append(pbeam.at(j)) + states.append(pbeam.at(i)) beam_map.update(seen) g_indices.append([]) for i in range(gbeam.size): - state = gbeam.at(j) + state = gbeam.at(i) if not state.is_final(): key = tuple([eg_id] + gbeam.histories[i]) if key in seen: @@ -243,17 +230,17 @@ def get_gradient(nr_class, beam_maps, histories, losses): nr_step = len(beam_maps) grads = [] for beam_map in beam_maps: - if beam_map: - grads.append(numpy.zeros((max(beam_map.values())+1, nr_class), dtype='f')) - else: - grads.append(None) + grads.append(numpy.zeros((max(beam_map.values())+1, nr_class), dtype='f')) + assert len(histories) == len(losses) for eg_id, hists in enumerate(histories): for loss, hist in zip(losses[eg_id], hists): key = tuple([eg_id]) for j, clas in enumerate(hist): - if grads[j] is None: - continue - i = beam_maps[j][key] + try: + i = beam_maps[j][key] + except: + print(sorted(beam_maps[j].items())) + raise # In step j, at state i action clas # resulted in loss grads[j][i, clas] += loss diff --git a/spacy/syntax/beam_parser.pyx b/spacy/syntax/beam_parser.pyx index e96e28fcf..f4f66f9fb 100644 --- a/spacy/syntax/beam_parser.pyx +++ b/spacy/syntax/beam_parser.pyx @@ -34,6 +34,7 @@ from ._parse_features cimport CONTEXT_SIZE from ._parse_features cimport fill_context from .stateclass cimport StateClass from .parser cimport Parser +from ._beam_utils import is_gold DEBUG = False @@ -237,16 +238,3 @@ def _check_train_integrity(Beam pred, Beam gold, GoldParse gold_parse, Transitio raise Exception("Gold parse is not gold-standard") -def is_gold(StateClass state, GoldParse gold, StringStore strings): - predicted = set() - truth = set() - for i in range(gold.length): - if gold.cand_to_gold[i] is None: - continue - if state.safe_get(i).dep: - predicted.add((i, state.H(i), strings[state.safe_get(i).dep])) - else: - predicted.add((i, state.H(i), 'ROOT')) - id_, word, tag, head, dep, ner = gold.orig_annot[gold.cand_to_gold[i]] - truth.add((id_, head, dep)) - return truth == predicted diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index ea61af1df..51fd61cc1 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -66,7 +66,7 @@ from ..attrs cimport ID, TAG, DEP, ORTH, NORM, PREFIX, SUFFIX, TAG from . import _beam_utils USE_FINE_TUNE = True -BEAM_PARSE = False +BEAM_PARSE = True def get_templates(*args, **kwargs): return [] @@ -348,6 +348,8 @@ cdef class Parser: The number of threads with which to work on the buffer in parallel. Yields (Doc): Documents, in order. 
""" + if BEAM_PARSE: + beam_width = 8 cdef Doc doc cdef Beam beam for docs in cytoolz.partition_all(batch_size, docs): @@ -439,6 +441,8 @@ cdef class Parser: cuda_stream, 0.0) beams = [] cdef int offset = 0 + cdef int j = 0 + cdef int k for doc in docs: beam = Beam(nr_class, beam_width, min_density=beam_density) beam.initialize(self.moves.init_beam_state, doc.length, doc.c) @@ -451,16 +455,22 @@ cdef class Parser: states = [] for i in range(beam.size): stcls = beam.at(i) - states.append(stcls) + # This way we avoid having to score finalized states + # We do have to take care to keep indexes aligned, though + if not stcls.is_final(): + states.append(stcls) token_ids = self.get_token_ids(states) vectors = state2vec(token_ids) scores = vec2scores(vectors) + j = 0 + c_scores = scores.data for i in range(beam.size): stcls = beam.at(i) if not stcls.is_final(): self.moves.set_valid(beam.is_valid[i], stcls.c) - for j in range(nr_class): - beam.scores[i][j] = scores[i, j] + for k in range(nr_class): + beam.scores[i][k] = c_scores[j * scores.shape[1] + k] + j += 1 beam.advance(_transition_state, _hash_state, self.moves.c) beam.check_done(_check_final_state, NULL) beams.append(beam) @@ -540,6 +550,7 @@ cdef class Parser: losses[self.name] = 0. docs, tokvecs = docs_tokvecs lengths = [len(d) for d in docs] + assert min(lengths) >= 1 tokvecs = self.model[0].ops.flatten(tokvecs) if USE_FINE_TUNE: my_tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop) @@ -554,9 +565,14 @@ cdef class Parser: states_d_scores, backprops = _beam_utils.update_beam(self.moves, self.nr_feature, max_moves, states, tokvecs, golds, state2vec, vec2scores, - drop, sgd, losses) + drop, sgd, losses, + width=8) backprop_lower = [] for i, d_scores in enumerate(states_d_scores): + if d_scores is None: + continue + if losses is not None: + losses[self.name] += (d_scores**2).sum() ids, bp_vectors, bp_scores = backprops[i] d_vector = bp_scores(d_scores, sgd=sgd) if isinstance(self.model[0].ops, CupyOps) \ @@ -617,14 +633,10 @@ cdef class Parser: xp = get_array_module(d_tokvecs) for ids, d_vector, bp_vector in backprops: d_state_features = bp_vector(d_vector, sgd=sgd) - active_feats = ids * (ids >= 0) - active_feats = active_feats.reshape((ids.shape[0], ids.shape[1], 1)) - if hasattr(xp, 'scatter_add'): - xp.scatter_add(d_tokvecs, - ids, d_state_features * active_feats) - else: - xp.add.at(d_tokvecs, - ids, d_state_features * active_feats) + mask = ids >= 0 + indices = xp.nonzero(mask) + self.model[0].ops.scatter_add(d_tokvecs, ids[indices], + d_state_features[indices]) @property def move_names(self): From 0ae045256df6f735b0a301914d87b5c26e2520d5 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 13 Aug 2017 18:02:05 -0500 Subject: [PATCH 48/49] Fix beam training --- spacy/syntax/_beam_utils.pyx | 59 +++++++++++++++++++++++++----------- spacy/syntax/nn_parser.pyx | 8 ++--- 2 files changed, 45 insertions(+), 22 deletions(-) diff --git a/spacy/syntax/_beam_utils.pyx b/spacy/syntax/_beam_utils.pyx index 6df8d472f..e77036e55 100644 --- a/spacy/syntax/_beam_utils.pyx +++ b/spacy/syntax/_beam_utils.pyx @@ -57,7 +57,7 @@ cdef class ParserBeam(object): for state in states: beam = Beam(self.moves.n_moves, width, density) beam.initialize(self.moves.init_beam_state, state.c.length, state.c._sent) - for i in range(beam.size): + for i in range(beam.width): st = beam.at(i) st.c.offset = state.c.offset self.beams.append(beam) @@ -81,7 +81,7 @@ cdef class ParserBeam(object): def advance(self, scores, follow_gold=False): cdef 
Beam beam for i, beam in enumerate(self.beams): - if beam.is_done: + if beam.is_done or not scores[i].size: continue self._set_scores(beam, scores[i]) if self.golds is not None: @@ -92,6 +92,12 @@ cdef class ParserBeam(object): else: beam.advance(_transition_state, _hash_state, self.moves.c) beam.check_done(_check_final_state, NULL) + if beam.is_done: + for j in range(beam.size): + if is_gold(beam.at(j), self.golds[i], self.moves.strings): + beam._states[j].loss = 0.0 + elif beam._states[j].loss == 0.0: + beam._states[j].loss = 1.0 def _set_scores(self, Beam beam, float[:, ::1] scores): cdef float* c_scores = &scores[0, 0] @@ -152,32 +158,49 @@ def update_beam(TransitionSystem moves, int nr_feature, int max_steps, width=width, density=density) gbeam = ParserBeam(moves, states, golds, width=width, density=0.0) + cdef StateClass state beam_maps = [] backprops = [] violns = [MaxViolation() for _ in range(len(states))] for t in range(max_steps): + # The beam maps let us find the right row in the flattened scores + # arrays for each state. States are identified by (example id, history). + # We keep a different beam map for each step (since we'll have a flat + # scores array for each step). The beam map will let us take the per-state + # losses, and compute the gradient for each (step, state, class). beam_maps.append({}) + # Gather all states from the two beams in a list. Some stats may occur + # in both beams. To figure out which beam each state belonged to, + # we keep two lists of indices, p_indices and g_indices states, p_indices, g_indices = get_states(pbeam, gbeam, beam_maps[-1], nr_update) if not states: break + # Now that we have our flat list of states, feed them through the model token_ids = get_token_ids(states, nr_feature) vectors, bp_vectors = state2vec.begin_update(token_ids, drop=drop) scores, bp_scores = vec2scores.begin_update(vectors, drop=drop) + # Store the callbacks for the backward pass backprops.append((token_ids, bp_vectors, bp_scores)) + # Unpack the flat scores into lists for the two beams. The indices arrays + # tell us which example and state the scores-row refers to. p_scores = [numpy.ascontiguousarray(scores[indices], dtype='f') for indices in p_indices] g_scores = [numpy.ascontiguousarray(scores[indices], dtype='f') for indices in g_indices] + # Now advance the states in the beams. The gold beam is contrained to + # to follow only gold analyses. pbeam.advance(p_scores) gbeam.advance(g_scores, follow_gold=True) - + # Track the "maximum violation", to use in the update. 
for i, violn in enumerate(violns): violn.check_crf(pbeam[i], gbeam[i]) - histories = [(v.p_hist + v.g_hist) for v in violns] - losses = [(v.p_probs + v.g_probs) for v in violns] + # Only make updates if we have non-gold states + histories = [((v.p_hist + v.g_hist) if v.p_hist else []) for v in violns] + losses = [((v.p_probs + v.g_probs) if v.p_probs else []) for v in violns] states_d_scores = get_gradient(moves.n_moves, beam_maps, histories, losses) + assert len(states_d_scores) == len(backprops), (len(states_d_scores), len(backprops)) return states_d_scores, backprops @@ -187,17 +210,20 @@ def get_states(pbeams, gbeams, beam_map, nr_update): p_indices = [] g_indices = [] cdef Beam pbeam, gbeam + assert len(pbeams) == len(gbeams) for eg_id, (pbeam, gbeam) in enumerate(zip(pbeams, gbeams)): p_indices.append([]) + g_indices.append([]) + if pbeam.loss > 0 and pbeam.min_score > gbeam.score: + continue for i in range(pbeam.size): state = pbeam.at(i) if not state.is_final(): key = tuple([eg_id] + pbeam.histories[i]) seen[key] = len(states) p_indices[-1].append(len(states)) - states.append(pbeam.at(i)) + states.append(state) beam_map.update(seen) - g_indices.append([]) for i in range(gbeam.size): state = gbeam.at(i) if not state.is_final(): @@ -207,10 +233,10 @@ def get_states(pbeams, gbeams, beam_map, nr_update): else: g_indices[-1].append(len(states)) beam_map[key] = len(states) - states.append(gbeam.at(i)) - p_indices = [numpy.asarray(idx, dtype='i') for idx in p_indices] - g_indices = [numpy.asarray(idx, dtype='i') for idx in g_indices] - return states, p_indices, g_indices + states.append(state) + p_idx = [numpy.asarray(idx, dtype='i') for idx in p_indices] + g_idx = [numpy.asarray(idx, dtype='i') for idx in g_indices] + return states, p_idx, g_idx def get_gradient(nr_class, beam_maps, histories, losses): @@ -230,20 +256,17 @@ def get_gradient(nr_class, beam_maps, histories, losses): nr_step = len(beam_maps) grads = [] for beam_map in beam_maps: - grads.append(numpy.zeros((max(beam_map.values())+1, nr_class), dtype='f')) + if beam_map: + grads.append(numpy.zeros((max(beam_map.values())+1, nr_class), dtype='f')) assert len(histories) == len(losses) for eg_id, hists in enumerate(histories): for loss, hist in zip(losses[eg_id], hists): key = tuple([eg_id]) for j, clas in enumerate(hist): - try: - i = beam_maps[j][key] - except: - print(sorted(beam_maps[j].items())) - raise + i = beam_maps[j][key] # In step j, at state i action clas # resulted in loss - grads[j][i, clas] += loss + grads[j][i, clas] += loss / len(histories) key = key + tuple([clas]) return grads diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 51fd61cc1..a193c96a3 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -557,20 +557,20 @@ cdef class Parser: my_tokvecs = self.model[0].ops.flatten(my_tokvecs) tokvecs += my_tokvecs - states, golds, max_moves = self._init_gold_batch(docs, golds) + states = self.moves.init_batch(docs) + for gold in golds: + self.moves.preprocess_gold(gold) cuda_stream = get_cuda_stream() state2vec, vec2scores = self.get_batch_model(len(states), tokvecs, cuda_stream, 0.0) - states_d_scores, backprops = _beam_utils.update_beam(self.moves, self.nr_feature, max_moves, + states_d_scores, backprops = _beam_utils.update_beam(self.moves, self.nr_feature, 500, states, tokvecs, golds, state2vec, vec2scores, drop, sgd, losses, width=8) backprop_lower = [] for i, d_scores in enumerate(states_d_scores): - if d_scores is None: - continue if losses is not None: 
losses[self.name] += (d_scores**2).sum() ids, bp_vectors, bp_scores = backprops[i] From ac6c25f7629011c9a51692e684b1e1db3422585d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 14 Aug 2017 12:09:18 +0200 Subject: [PATCH 49/49] Check SGD is not None in update --- spacy/_ml.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/_ml.py b/spacy/_ml.py index e37bcac52..91b530fad 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -381,7 +381,8 @@ def fine_tune(embedding, combine=None): flat_grad = model.ops.flatten(d_output) model.d_mix[1] += flat_tokvecs.dot(flat_grad.T).sum() model.d_mix[0] += flat_vecs.dot(flat_grad.T).sum() - sgd(model._mem.weights, model._mem.gradient, key=model.id) + if sgd is not None: + sgd(model._mem.weights, model._mem.gradient, key=model.id) return d_output return output, fine_tune_bwd model = wrap(fine_tune_fwd, embedding)
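
The drop_layer() wrapper introduced around patches 32-33 is easier to see on toy data. The sketch below is a plain-numpy illustration of the idea, not the thinc API: with probability proportional to the dropout rate the whole layer is skipped and the input passes through unchanged (a stochastic-depth style shortcut), otherwise the layer runs with its usual dropout. The names drop_layer_fwd, relu and the factor default are illustrative only.

    import numpy

    def drop_layer_fwd(layer_fwd, X, drop=0.1, factor=2.0, rng=numpy.random):
        # Skip the layer entirely with probability drop * factor,
        # otherwise apply it with the normal dropout rate.
        if rng.uniform() < drop * factor:
            return X
        return layer_fwd(X, drop=drop)

    relu = lambda X, drop=0.: numpy.maximum(X, 0)
    print(drop_layer_fwd(relu, numpy.array([-1.0, 2.0])))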
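
The gradient bookkeeping for beam training (patches 45-48) is likewise easier to follow on a toy example. The snippet below is a numpy-only restatement of the get_gradient() logic, not spaCy code, and omits details such as empty trailing steps: beam_maps[step] maps an (example id, action, ...) history prefix to the row that state occupied in that step's score matrix, and every action in a candidate's history receives that candidate's loss.

    import numpy

    def toy_beam_gradients(nr_class, beam_maps, histories, losses):
        grads = [numpy.zeros((max(m.values()) + 1, nr_class), dtype='f')
                 for m in beam_maps]
        for eg_id, hists in enumerate(histories):
            for loss, hist in zip(losses[eg_id], hists):
                key = (eg_id,)
                for step, action in enumerate(hist):
                    row = beam_maps[step][key]            # row this state occupied at this step
                    grads[step][row, action] += loss / len(histories)
                    key = key + (action,)                 # extend the history prefix
        return grads

    # One example, two candidates of two actions each; only the candidate
    # that took actions (0, 1) is penalised.
    beam_maps = [{(0,): 0}, {(0, 0): 0, (0, 1): 1}]
    histories = [[[0, 1], [1, 0]]]
    losses = [[1.0, 0.0]]
    for step, grad in enumerate(toy_beam_gradients(2, beam_maps, histories, losses)):
        print(step, grad)

Step 0's gradient puts the loss on action 0 of the single initial state; step 1's puts it on action 1 of the row the penalised candidate had reached.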
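
Patch 47 also switches Parser._make_updates() to a masked scatter-add when pushing the state-feature gradients back onto the token vectors. A minimal numpy analogue of that accumulation, using numpy.add.at as a stand-in for ops.scatter_add (array names and sizes here are made up), looks like this:

    import numpy

    d_tokvecs = numpy.zeros((5, 3), dtype='f')               # one row per token
    ids = numpy.array([[0, 2, -1], [1, -1, -1]])              # feature token ids, -1 = padding
    d_state_features = numpy.ones((2, 3, 3), dtype='f')       # (state, feature, vector width)

    mask = ids >= 0                                            # drop the padding features
    idx = numpy.nonzero(mask)
    numpy.add.at(d_tokvecs, ids[idx], d_state_features[idx])  # accumulate per-token gradients
    print(d_tokvecs)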