From dac3f1b280e3819961205e3c6c5dc97c48d4ac18 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sat, 3 Nov 2018 10:52:50 +0000
Subject: [PATCH] Improve Tensorizer

---
 spacy/pipeline.pyx | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx
index 42a0f07e0..b4a11bd2c 100644
--- a/spacy/pipeline.pyx
+++ b/spacy/pipeline.pyx
@@ -31,7 +31,7 @@ from .matcher import Matcher
 
 from .matcher import Matcher, PhraseMatcher
 from .tokens.span import Span
-from .attrs import POS
+from .attrs import POS, ID
 from .parts_of_speech import X
 from ._ml import Tok2Vec, build_text_classifier, build_tagger_model
 from ._ml import link_vectors_to_models, zero_init, flatten
@@ -434,7 +434,7 @@ class Tensorizer(Pipe):
     name = 'tensorizer'
 
     @classmethod
-    def Model(cls, output_size=300, input_size=128, **cfg):
+    def Model(cls, output_size=300, **cfg):
         """Create a new statistical model for the class.
 
         width (int): Output size of the model.
@@ -442,6 +442,7 @@ class Tensorizer(Pipe):
         **cfg: Config parameters.
         RETURNS (Model): A `thinc.neural.Model` or similar instance.
         """
+        input_size = util.env_opt('token_vector_width', cfg.get('input_size', 128))
         return zero_init(Affine(output_size, input_size))
 
     def __init__(self, vocab, model=True, **cfg):
@@ -540,12 +541,8 @@ class Tensorizer(Pipe):
         return loss
 
     def get_loss(self, docs, golds, prediction):
-        target = []
-        i = 0
-        for doc in docs:
-            vectors = self.model.ops.xp.vstack([w.vector for w in doc])
-            target.append(vectors)
-        target = self.model.ops.xp.vstack(target)
+        ids = self.model.ops.flatten([doc.to_array(ID).ravel() for doc in docs])
+        target = self.vocab.vectors.data[ids]
         d_scores = (prediction - target)
         loss = (d_scores**2).sum()
         return loss, d_scores