Remove cytoolz usage from spaCy

This commit is contained in:
Matthew Honnibal 2018-12-03 02:19:12 +01:00
parent a7b085ae46
commit 1c71fdb805
3 changed files with 7 additions and 10 deletions

View File

@@ -5,7 +5,6 @@ from __future__ import unicode_literals
 import numpy
 cimport numpy as np
-import cytoolz
 from collections import OrderedDict, defaultdict
 import srsly
@@ -302,7 +301,7 @@ class Pipe(object):
         Both __call__ and pipe should delegate to the `predict()`
         and `set_annotations()` methods.
         """
-        for docs in cytoolz.partition_all(batch_size, stream):
+        for docs in util.minibatch(stream, size=batch_size):
             docs = list(docs)
             scores, tensors = self.predict(docs)
             self.set_annotations(docs, scores, tensor=tensors)
@@ -479,7 +478,7 @@ class Tensorizer(Pipe):
         n_threads (int): Number of threads.
         YIELDS (iterator): A sequence of `Doc` objects, in order of input.
         """
-        for docs in cytoolz.partition_all(batch_size, stream):
+        for docs in util.minibatch(stream, size=batch_size):
             docs = list(docs)
             tensors = self.predict(docs)
             self.set_annotations(docs, tensors)
@@ -588,7 +587,7 @@ class Tagger(Pipe):
         return doc

     def pipe(self, stream, batch_size=128, n_threads=-1):
-        for docs in cytoolz.partition_all(batch_size, stream):
+        for docs in util.minibatch(stream, size=batch_size):
             docs = list(docs)
             tag_ids, tokvecs = self.predict(docs)
             self.set_annotations(docs, tag_ids, tensors=tokvecs)
@@ -1073,7 +1072,7 @@ class TextCategorizer(Pipe):
         return doc

     def pipe(self, stream, batch_size=128, n_threads=-1):
-        for docs in cytoolz.partition_all(batch_size, stream):
+        for docs in util.minibatch(stream, size=batch_size):
             docs = list(docs)
             scores, tensors = self.predict(docs)
             self.set_annotations(docs, scores, tensors=tensors)

View File

@@ -7,7 +7,6 @@ from __future__ import unicode_literals, print_function
 from collections import OrderedDict
 import numpy
 cimport cython.parallel
-import cytoolz
 import numpy.random
 cimport numpy as np
 from cpython.ref cimport PyObject, Py_XDECREF
@@ -213,10 +212,10 @@ cdef class Parser:
         beam_width = self.cfg.get('beam_width', 1)
         beam_density = self.cfg.get('beam_density', 0.)
         cdef Doc doc
-        for batch in cytoolz.partition_all(batch_size, docs):
+        for batch in util.minibatch(docs, size=batch_size):
             batch_in_order = list(batch)
             by_length = sorted(batch_in_order, key=lambda doc: len(doc))
-            for subbatch in cytoolz.partition_all(8, by_length):
+            for subbatch in util.minibatch(by_length, size=8):
                 subbatch = list(subbatch)
                 parse_states = self.predict(subbatch, beam_width=beam_width,
                                             beam_density=beam_density)

View File

@@ -11,7 +11,6 @@ from collections import OrderedDict
 from thinc.neural._classes.model import Model
 from thinc.neural.ops import NumpyOps
 import functools
-import cytoolz
 import itertools
 import numpy.random
 import srsly
@@ -403,7 +402,7 @@ def minibatch(items, size=8):
     items = iter(items)
     while True:
         batch_size = next(size_)
-        batch = list(cytoolz.take(int(batch_size), items))
+        batch = list(itertools.islice(items, int(batch_size)))
        if len(batch) == 0:
            break
        yield list(batch)