Remove cytoolz usage from spaCy

This commit is contained in:
Matthew Honnibal 2018-12-03 02:19:12 +01:00
parent a7b085ae46
commit 1c71fdb805
3 changed files with 7 additions and 10 deletions

View File

@@ -5,7 +5,6 @@ from __future__ import unicode_literals
 import numpy
 cimport numpy as np
-import cytoolz
 from collections import OrderedDict, defaultdict
 import srsly
@@ -302,7 +301,7 @@ class Pipe(object):
         Both __call__ and pipe should delegate to the `predict()`
         and `set_annotations()` methods.
         """
-        for docs in cytoolz.partition_all(batch_size, stream):
+        for docs in util.minibatch(stream, size=batch_size):
             docs = list(docs)
             scores, tensors = self.predict(docs)
             self.set_annotations(docs, scores, tensor=tensors)
@@ -479,7 +478,7 @@ class Tensorizer(Pipe):
         n_threads (int): Number of threads.
         YIELDS (iterator): A sequence of `Doc` objects, in order of input.
         """
-        for docs in cytoolz.partition_all(batch_size, stream):
+        for docs in util.minibatch(stream, size=batch_size):
             docs = list(docs)
             tensors = self.predict(docs)
             self.set_annotations(docs, tensors)
@@ -588,7 +587,7 @@ class Tagger(Pipe):
         return doc

     def pipe(self, stream, batch_size=128, n_threads=-1):
-        for docs in cytoolz.partition_all(batch_size, stream):
+        for docs in util.minibatch(stream, size=batch_size):
             docs = list(docs)
             tag_ids, tokvecs = self.predict(docs)
             self.set_annotations(docs, tag_ids, tensors=tokvecs)
@@ -1073,7 +1072,7 @@ class TextCategorizer(Pipe):
         return doc

     def pipe(self, stream, batch_size=128, n_threads=-1):
-        for docs in cytoolz.partition_all(batch_size, stream):
+        for docs in util.minibatch(stream, size=batch_size):
             docs = list(docs)
             scores, tensors = self.predict(docs)
             self.set_annotations(docs, scores, tensors=tensors)

View File

@@ -7,7 +7,6 @@ from __future__ import unicode_literals, print_function
 from collections import OrderedDict
 import numpy
 cimport cython.parallel
-import cytoolz
 import numpy.random
 cimport numpy as np
 from cpython.ref cimport PyObject, Py_XDECREF
@@ -213,10 +212,10 @@ cdef class Parser:
         beam_width = self.cfg.get('beam_width', 1)
         beam_density = self.cfg.get('beam_density', 0.)
         cdef Doc doc
-        for batch in cytoolz.partition_all(batch_size, docs):
+        for batch in util.minibatch(docs, size=batch_size):
             batch_in_order = list(batch)
             by_length = sorted(batch_in_order, key=lambda doc: len(doc))
-            for subbatch in cytoolz.partition_all(8, by_length):
+            for subbatch in util.minibatch(by_length, size=8):
                 subbatch = list(subbatch)
                 parse_states = self.predict(subbatch, beam_width=beam_width,
                                             beam_density=beam_density)

View File

@@ -11,7 +11,6 @@ from collections import OrderedDict
 from thinc.neural._classes.model import Model
 from thinc.neural.ops import NumpyOps
 import functools
-import cytoolz
 import itertools
 import numpy.random
 import srsly
@@ -403,7 +402,7 @@ def minibatch(items, size=8):
     items = iter(items)
     while True:
         batch_size = next(size_)
-        batch = list(cytoolz.take(int(batch_size), items))
+        batch = list(itertools.islice(items, int(batch_size)))
        if len(batch) == 0:
            break
        yield list(batch)