From 2ddd552d52ff51cb41d6ec3065afcb1bc8658ab4 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sat, 23 Mar 2019 16:05:28 +0100
Subject: [PATCH] Fix code for bag-of-words feature extraction

The _ml.py module had two near-duplicate functions for extracting unigram
bag-of-words features, one of which had a bug that set all feature values
to 0. A third function extracted bigram features. Replace all three with a
single extract_ngrams layer that supports arbitrary ngram sizes and allows
control of which token attribute is used (e.g. ORTH or LOWER).
---
 spacy/_ml.py | 94 ++++++++++++++++++++++++++++++----------------------
 1 file changed, 54 insertions(+), 40 deletions(-)

diff --git a/spacy/_ml.py b/spacy/_ml.py
index a32d2cf20..ad061d79c 100644
--- a/spacy/_ml.py
+++ b/spacy/_ml.py
@@ -81,18 +81,6 @@ def _zero_init(model):
     return model
 
 
-@layerize
-def _preprocess_doc(docs, drop=0.0):
-    keys = [doc.to_array(LOWER) for doc in docs]
-    # The dtype here matches what thinc is expecting -- which differs per
-    # platform (by int definition). This should be fixed once the problem
-    # is fixed on Thinc's side.
-    lengths = numpy.array([arr.shape[0] for arr in keys], dtype=numpy.int_)
-    keys = numpy.concatenate(keys)
-    vals = numpy.zeros(keys.shape, dtype='f')
-    return (keys, vals, lengths), None
-
-
 def with_cpu(ops, model):
     """Wrap a model that should run on CPU, transferring inputs and outputs
     as necessary."""
@@ -133,20 +121,31 @@ def _to_device(ops, X):
         return ops.asarray(X)
 
 
-@layerize
-def _preprocess_doc_bigrams(docs, drop=0.0):
-    unigrams = [doc.to_array(LOWER) for doc in docs]
-    ops = Model.ops
-    bigrams = [ops.ngrams(2, doc_unis) for doc_unis in unigrams]
-    keys = [ops.xp.concatenate(feats) for feats in zip(unigrams, bigrams)]
-    keys, vals = zip(*[ops.xp.unique(k, return_counts=True) for k in keys])
-    # The dtype here matches what thinc is expecting -- which differs per
-    # platform (by int definition). This should be fixed once the problem
-    # is fixed on Thinc's side.
-    lengths = ops.asarray([arr.shape[0] for arr in keys], dtype=numpy.int_)
-    keys = ops.xp.concatenate(keys)
-    vals = ops.asarray(ops.xp.concatenate(vals), dtype="f")
-    return (keys, vals, lengths), None
+class extract_ngrams(Model):
+    def __init__(self, ngram_size, attr=LOWER):
+        Model.__init__(self)
+        self.ngram_size = ngram_size
+        self.attr = attr
+
+    def begin_update(self, docs, drop=0.0):
+        batch_keys = []
+        batch_vals = []
+        for doc in docs:
+            unigrams = doc.to_array([self.attr])
+            ngrams = [unigrams]
+            for n in range(2, self.ngram_size + 1):
+                ngrams.append(self.ops.ngrams(n, unigrams))
+            keys = self.ops.xp.concatenate(ngrams)
+            keys, vals = self.ops.xp.unique(keys, return_counts=True)
+            batch_keys.append(keys)
+            batch_vals.append(vals)
+        # The dtype here matches what thinc is expecting -- which differs per
+        # platform (by int definition). This should be fixed once the problem
+        # is fixed on Thinc's side.
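+        # Note: (batch_keys, batch_vals, lengths) is a ragged-batch
+        # encoding -- lengths[i] says how many of the concatenated
+        # (key, count) pairs below belong to doc i.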
+        lengths = self.ops.asarray([arr.shape[0] for arr in batch_keys], dtype=numpy.int_)
+        batch_keys = self.ops.xp.concatenate(batch_keys)
+        batch_vals = self.ops.asarray(self.ops.xp.concatenate(batch_vals), dtype="f")
+        return (batch_keys, batch_vals, lengths), None
 
 
 @describe.on_data(
@@ -486,16 +485,6 @@ def zero_init(model):
     return model
 
 
-@layerize
-def preprocess_doc(docs, drop=0.0):
-    keys = [doc.to_array([LOWER]) for doc in docs]
-    ops = Model.ops
-    lengths = ops.asarray([arr.shape[0] for arr in keys])
-    keys = ops.xp.concatenate(keys)
-    vals = ops.allocate(keys.shape[0]) + 1
-    return (keys, vals, lengths), None
-
-
 def getitem(i):
     def getitem_fwd(X, drop=0.0):
         return X[i], None
@@ -602,10 +591,8 @@ def build_text_classifier(nr_class, width=64, **cfg):
             >> zero_init(Affine(nr_class, width, drop_factor=0.0))
         )
 
-        linear_model = (
-            _preprocess_doc
-            >> with_cpu(Model.ops, LinearModel(nr_class))
-        )
+        linear_model = build_bow_text_classifier(
+            nr_class, ngram_size=cfg.get("ngram_size", 1), no_output_layer=True)
         if cfg.get('exclusive_classes'):
             output_layer = Softmax(nr_class, nr_class * 2)
         else:
@@ -623,6 +610,33 @@ def build_text_classifier(nr_class, width=64, **cfg):
     return model
 
 
+def build_bow_text_classifier(nr_class, ngram_size=1, exclusive_classes=False,
+                              no_output_layer=False, **cfg):
+    with Model.define_operators({">>": chain}):
+        model = (
+            extract_ngrams(ngram_size, attr=ORTH)
+            >> with_cpu(Model.ops,
+                LinearModel(nr_class)
+            )
+        )
+        if not no_output_layer:
+            model = model >> (cpu_softmax if exclusive_classes else logistic)
+    model.nO = nr_class
+    return model
+
+
+@layerize
+def cpu_softmax(X, drop=0.):
+    ops = NumpyOps()
+
+    Y = ops.softmax(X)
+
+    def cpu_softmax_backward(dY, sgd=None):
+        return dY
+
+    return Y, cpu_softmax_backward
+
+
 def build_simple_cnn_text_classifier(tok2vec, nr_class, exclusive_classes=False, **cfg):
     """
     Build a simple CNN text classifier, given a token-to-vector model as inputs.
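
For reviewers: below is a minimal, self-contained sketch of what the new
extract_ngrams layer computes, in plain numpy. The ngram_keys helper is a
hypothetical stand-in for thinc's ops.ngrams (which hashes each window of
n token IDs down to a single key); the batching logic mirrors
extract_ngrams.begin_update above. None of these names are spaCy API.

import numpy


def ngram_keys(unigrams, n):
    # Stand-in for thinc's ops.ngrams: fold each window of n unigram IDs
    # into one uint64 key. thinc uses a proper hash; a multiply-add with
    # wraparound is enough to illustrate the idea.
    if unigrams.shape[0] < n:
        return numpy.zeros((0,), dtype="uint64")
    windows = [unigrams[i : unigrams.shape[0] - n + 1 + i] for i in range(n)]
    keys = numpy.zeros_like(windows[0])
    for w in windows:
        keys = keys * numpy.uint64(2654435761) + w
    return keys


def extract_ngrams_batch(docs_as_ids, ngram_size):
    # Mirrors extract_ngrams.begin_update: per doc, pool unigram..ngram
    # keys, count the unique keys, then flatten the batch and record
    # per-doc lengths.
    batch_keys = []
    batch_vals = []
    for unigrams in docs_as_ids:
        ngrams = [unigrams]
        for n in range(2, ngram_size + 1):
            ngrams.append(ngram_keys(unigrams, n))
        keys, vals = numpy.unique(numpy.concatenate(ngrams), return_counts=True)
        batch_keys.append(keys)
        batch_vals.append(vals)
    # numpy.int_ echoes the patch's dtype caveat: thinc expects the
    # platform-dependent int here.
    lengths = numpy.array([k.shape[0] for k in batch_keys], dtype=numpy.int_)
    keys = numpy.concatenate(batch_keys)
    vals = numpy.concatenate(batch_vals).astype("f")
    return keys, vals, lengths


# Example: two "docs" given as arrays of token IDs (what doc.to_array
# would return for an attribute like LOWER), with unigrams + bigrams:
docs = [numpy.array([5, 8, 5, 2], dtype="uint64"),
        numpy.array([7, 7], dtype="uint64")]
keys, vals, lengths = extract_ngrams_batch(docs, ngram_size=2)
print(lengths)  # number of unique (key, count) pairs per doc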