Remove cytoolz usage from spaCy

This commit is contained in:
Matthew Honnibal 2018-12-03 02:19:12 +01:00
parent a7b085ae46
commit 1c71fdb805
3 changed files with 7 additions and 10 deletions

View File

@@ -5,7 +5,6 @@ from __future__ import unicode_literals
import numpy
cimport numpy as np
import cytoolz
from collections import OrderedDict, defaultdict
import srsly
@@ -302,7 +301,7 @@ class Pipe(object):
Both __call__ and pipe should delegate to the `predict()`
and `set_annotations()` methods.
"""
for docs in cytoolz.partition_all(batch_size, stream):
for docs in util.minibatch(stream, size=batch_size):
docs = list(docs)
scores, tensors = self.predict(docs)
self.set_annotations(docs, scores, tensor=tensors)
@@ -479,7 +478,7 @@ class Tensorizer(Pipe):
n_threads (int): Number of threads.
YIELDS (iterator): A sequence of `Doc` objects, in order of input.
"""
for docs in cytoolz.partition_all(batch_size, stream):
for docs in util.minibatch(stream, size=batch_size):
docs = list(docs)
tensors = self.predict(docs)
self.set_annotations(docs, tensors)
@@ -588,7 +587,7 @@ class Tagger(Pipe):
return doc
def pipe(self, stream, batch_size=128, n_threads=-1):
for docs in cytoolz.partition_all(batch_size, stream):
for docs in util.minibatch(stream, size=batch_size):
docs = list(docs)
tag_ids, tokvecs = self.predict(docs)
self.set_annotations(docs, tag_ids, tensors=tokvecs)
@@ -1073,7 +1072,7 @@ class TextCategorizer(Pipe):
return doc
def pipe(self, stream, batch_size=128, n_threads=-1):
for docs in cytoolz.partition_all(batch_size, stream):
for docs in util.minibatch(stream, size=batch_size):
docs = list(docs)
scores, tensors = self.predict(docs)
self.set_annotations(docs, scores, tensors=tensors)

View File

@@ -7,7 +7,6 @@ from __future__ import unicode_literals, print_function
from collections import OrderedDict
import numpy
cimport cython.parallel
import cytoolz
import numpy.random
cimport numpy as np
from cpython.ref cimport PyObject, Py_XDECREF
@@ -213,10 +212,10 @@ cdef class Parser:
beam_width = self.cfg.get('beam_width', 1)
beam_density = self.cfg.get('beam_density', 0.)
cdef Doc doc
for batch in cytoolz.partition_all(batch_size, docs):
for batch in util.minibatch(docs, size=batch_size):
batch_in_order = list(batch)
by_length = sorted(batch_in_order, key=lambda doc: len(doc))
for subbatch in cytoolz.partition_all(8, by_length):
for subbatch in util.minibatch(by_length, size=8):
subbatch = list(subbatch)
parse_states = self.predict(subbatch, beam_width=beam_width,
beam_density=beam_density)

View File

@@ -11,7 +11,6 @@ from collections import OrderedDict
from thinc.neural._classes.model import Model
from thinc.neural.ops import NumpyOps
import functools
import cytoolz
import itertools
import numpy.random
import srsly
@@ -403,7 +402,7 @@ def minibatch(items, size=8):
items = iter(items)
while True:
batch_size = next(size_)
batch = list(cytoolz.take(int(batch_size), items))
batch = list(itertools.islice(items, int(batch_size)))
if len(batch) == 0:
break
yield list(batch)