Mirror of https://github.com/explosion/spaCy.git, synced 2025-10-02 18:06:46 +03:00
Fix code for bag-of-words feature extraction
The _ml.py module had two near-duplicate functions for extracting unigram bag-of-words features, one of which had a bug that set all feature values to 0. A third function extracted bigram features. Replace all three with a single extract_ngrams layer that supports arbitrary ngram sizes and allows control of which token attribute is used as the key (e.g. ORTH, LOWER, etc.).
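For context, here is a minimal sketch of how the replacement might be used, based on the code added in the diff below. The pipeline name, example texts, and setup are illustrative assumptions, not part of this commit:

import spacy
from spacy.attrs import ORTH
from spacy._ml import build_bow_text_classifier, extract_ngrams

# Any pipeline that produces Doc objects will do; en_core_web_sm is just
# an example model.
nlp = spacy.load("en_core_web_sm")
docs = [nlp(t) for t in ["the cat sat", "the dog ran"]]

# One layer now covers unigrams up through ngram_size-grams, keyed on any
# token attribute (ORTH, LOWER, ...).
ngram_layer = extract_ngrams(ngram_size=2, attr=ORTH)
(keys, vals, lengths), _ = ngram_layer.begin_update(docs)

# Or build the whole bag-of-words classifier in one call.
model = build_bow_text_classifier(3, ngram_size=2)
scores = model(docs)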
parent 06bf130890
commit 2ddd552d52
spacy/_ml.py | 88
@@ -81,18 +81,6 @@ def _zero_init(model):
     return model
 
 
-@layerize
-def _preprocess_doc(docs, drop=0.0):
-    keys = [doc.to_array(LOWER) for doc in docs]
-    # The dtype here matches what thinc is expecting -- which differs per
-    # platform (by int definition). This should be fixed once the problem
-    # is fixed on Thinc's side.
-    lengths = numpy.array([arr.shape[0] for arr in keys], dtype=numpy.int_)
-    keys = numpy.concatenate(keys)
-    vals = numpy.zeros(keys.shape, dtype='f')
-    return (keys, vals, lengths), None
-
-
 def with_cpu(ops, model):
     """Wrap a model that should run on CPU, transferring inputs and outputs
     as necessary."""
@@ -133,20 +121,31 @@ def _to_device(ops, X):
     return ops.asarray(X)
 
 
-@layerize
-def _preprocess_doc_bigrams(docs, drop=0.0):
-    unigrams = [doc.to_array(LOWER) for doc in docs]
-    ops = Model.ops
-    bigrams = [ops.ngrams(2, doc_unis) for doc_unis in unigrams]
-    keys = [ops.xp.concatenate(feats) for feats in zip(unigrams, bigrams)]
-    keys, vals = zip(*[ops.xp.unique(k, return_counts=True) for k in keys])
-    # The dtype here matches what thinc is expecting -- which differs per
-    # platform (by int definition). This should be fixed once the problem
-    # is fixed on Thinc's side.
-    lengths = ops.asarray([arr.shape[0] for arr in keys], dtype=numpy.int_)
-    keys = ops.xp.concatenate(keys)
-    vals = ops.asarray(ops.xp.concatenate(vals), dtype="f")
-    return (keys, vals, lengths), None
+class extract_ngrams(Model):
+    def __init__(self, ngram_size, attr=LOWER):
+        Model.__init__(self)
+        self.ngram_size = ngram_size
+        self.attr = attr
+
+    def begin_update(self, docs, drop=0.0):
+        batch_keys = []
+        batch_vals = []
+        for doc in docs:
+            unigrams = doc.to_array([self.attr])
+            ngrams = [unigrams]
+            for n in range(2, self.ngram_size + 1):
+                ngrams.append(self.ops.ngrams(n, unigrams))
+            keys = self.ops.xp.concatenate(ngrams)
+            keys, vals = self.ops.xp.unique(keys, return_counts=True)
+            batch_keys.append(keys)
+            batch_vals.append(vals)
+        # The dtype here matches what thinc is expecting -- which differs per
+        # platform (by int definition). This should be fixed once the problem
+        # is fixed on Thinc's side.
+        lengths = self.ops.asarray([arr.shape[0] for arr in batch_keys], dtype=numpy.int_)
+        batch_keys = self.ops.xp.concatenate(batch_keys)
+        batch_vals = self.ops.asarray(self.ops.xp.concatenate(batch_vals), dtype="f")
+        return (batch_keys, batch_vals, lengths), None
 
 
 @describe.on_data(
@@ -486,16 +485,6 @@ def zero_init(model):
     return model
 
 
-@layerize
-def preprocess_doc(docs, drop=0.0):
-    keys = [doc.to_array([LOWER]) for doc in docs]
-    ops = Model.ops
-    lengths = ops.asarray([arr.shape[0] for arr in keys])
-    keys = ops.xp.concatenate(keys)
-    vals = ops.allocate(keys.shape[0]) + 1
-    return (keys, vals, lengths), None
-
-
 def getitem(i):
     def getitem_fwd(X, drop=0.0):
         return X[i], None
@@ -602,10 +591,8 @@ def build_text_classifier(nr_class, width=64, **cfg):
         >> zero_init(Affine(nr_class, width, drop_factor=0.0))
     )
 
-    linear_model = (
-        _preprocess_doc
-        >> with_cpu(Model.ops, LinearModel(nr_class))
-    )
+    linear_model = build_bow_text_classifier(
+        nr_class, ngram_size=cfg.get("ngram_size", 1), no_output_layer=True)
     if cfg.get('exclusive_classes'):
         output_layer = Softmax(nr_class, nr_class * 2)
     else:
@@ -623,6 +610,33 @@ def build_text_classifier(nr_class, width=64, **cfg):
     return model
 
 
+def build_bow_text_classifier(nr_class, ngram_size=1, exclusive_classes=False,
+                              no_output_layer=False, **cfg):
+    with Model.define_operators({">>": chain}):
+        model = (
+            extract_ngrams(ngram_size, attr=ORTH)
+            >> with_cpu(Model.ops,
+                LinearModel(nr_class)
+            )
+        )
+        if not no_output_layer:
+            model = model >> (cpu_softmax if exclusive_classes else logistic)
+    model.nO = nr_class
+    return model
+
+
+@layerize
+def cpu_softmax(X, drop=0.):
+    ops = NumpyOps()
+
+    Y = ops.softmax(X)
+
+    def cpu_softmax_backward(dY, sgd=None):
+        return dY
+
+    return ops.softmax(X), cpu_softmax_backward
+
+
 def build_simple_cnn_text_classifier(tok2vec, nr_class, exclusive_classes=False, **cfg):
     """
     Build a simple CNN text classifier, given a token-to-vector model as inputs.
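To make the per-doc logic in extract_ngrams.begin_update concrete, here is a rough pure-numpy mirror of what it computes for a single document. The rolling-hash helper is a toy stand-in for thinc's ops.ngrams (the real implementation hashes windows differently); only the data flow is the point:

import numpy

def toy_ngram_keys(n, unigrams):
    # Combine each window of n attribute IDs into a single key.
    windows = [unigrams[i:len(unigrams) - n + 1 + i] for i in range(n)]
    keys = windows[0].copy()
    for w in windows[1:]:
        keys = keys * numpy.uint64(1000003) + w  # stand-in for thinc's hashing
    return keys

unigrams = numpy.array([11, 42, 11, 7], dtype=numpy.uint64)  # e.g. LOWER IDs
ngram_size = 2
ngrams = [unigrams]
for n in range(2, ngram_size + 1):
    ngrams.append(toy_ngram_keys(n, unigrams))
keys = numpy.concatenate(ngrams)
keys, vals = numpy.unique(keys, return_counts=True)
# keys: unique unigram/bigram IDs for this doc; vals: their counts.
print(keys, vals)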