Tidy up references to n_threads and fix default

Ines Montani 2019-03-15 16:24:26 +01:00
parent 852e1f105c
commit cb5dbfa63a
7 changed files with 7 additions and 17 deletions

View File

@@ -49,7 +49,7 @@ class SentimentAnalyser(object):
         y = self._model.predict(X)
         self.set_sentiment(doc, y)

-    def pipe(self, docs, batch_size=1000, n_threads=2):
+    def pipe(self, docs, batch_size=1000):
         for minibatch in cytoolz.partition_all(batch_size, docs):
             minibatch = list(minibatch)
             sentences = []
@@ -176,7 +176,7 @@ def evaluate(model_dir, texts, labels, max_length=100):
     correct = 0
     i = 0
-    for doc in nlp.pipe(texts, batch_size=1000, n_threads=4):
+    for doc in nlp.pipe(texts, batch_size=1000):
         correct += bool(doc.sentiment >= 0.5) == bool(labels[i])
         i += 1
     return float(correct) / i
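With the keyword gone from the example script, batching is controlled by batch_size alone. A minimal sketch of the updated call site, assuming nlp is the pipeline this example builds (the texts list is a made-up stand-in):

    texts = ["This movie was great.", "The plot made no sense."]
    for doc in nlp.pipe(texts, batch_size=1000):
        # doc.sentiment is set by the example's SentimentAnalyser component
        print(bool(doc.sentiment >= 0.5))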

View File

@@ -644,7 +644,7 @@ class Language(object):
         self,
         texts,
         as_tuples=False,
-        n_threads=2,
+        n_threads=-1,
         batch_size=1000,
         disable=[],
         cleanup=False,
@@ -656,7 +656,6 @@ class Language(object):
         as_tuples (bool): If set to True, inputs should be a sequence of
             (text, context) tuples. Output will then be a sequence of
             (doc, context) tuples. Defaults to False.
-        n_threads (int): Currently inactive.
         batch_size (int): The number of texts to buffer.
         disable (list): Names of the pipeline components to disable.
         cleanup (bool): If True, unneeded strings are freed to control memory
@@ -673,7 +672,6 @@ class Language(object):
             contexts = (tc[1] for tc in text_context2)
             docs = self.pipe(
                 texts,
-                n_threads=n_threads,
                 batch_size=batch_size,
                 disable=disable,
                 component_cfg=component_cfg,
@@ -690,7 +688,6 @@ class Language(object):
             kwargs = component_cfg.get(name, {})
             # Allow component_cfg to overwrite the top-level kwargs.
             kwargs.setdefault("batch_size", batch_size)
-            kwargs.setdefault("n_threads", n_threads)
             if hasattr(proc, "pipe"):
                 docs = proc.pipe(docs, **kwargs)
             else:
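Since Language.pipe no longer forwards n_threads to components, per-component overrides go through component_cfg, which the loop above lets overwrite the top-level batch_size. A hedged sketch (the component name and sizes are illustrative):

    # n_threads is still accepted for backwards compatibility but is ignored
    docs = list(nlp.pipe(texts, batch_size=1000))

    # component_cfg entries overwrite the top-level kwargs for that component
    docs = list(nlp.pipe(texts, batch_size=1000,
                         component_cfg={"parser": {"batch_size": 256}}))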

View File

@@ -153,13 +153,11 @@ cdef class Matcher:
             return default
         return (self._callbacks[key], self._patterns[key])

-    def pipe(self, docs, batch_size=1000, n_threads=2):
+    def pipe(self, docs, batch_size=1000, n_threads=-1):
         """Match a stream of documents, yielding them in turn.

         docs (iterable): A stream of documents.
         batch_size (int): Number of documents to accumulate into a working set.
-        n_threads (int): The number of threads with which to work on the buffer
-            in parallel, if the implementation supports multi-threading.
         YIELDS (Doc): Documents, in order.
         """
         for doc in docs:
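Matcher.pipe keeps n_threads in its signature purely for backwards compatibility. A usage sketch under the v2-style add() API (the pattern is illustrative):

    from spacy.matcher import Matcher

    matcher = Matcher(nlp.vocab)
    matcher.add("HELLO", None, [{"LOWER": "hello"}])
    # pipe() streams the docs through and yields them in order;
    # calling matcher(doc) returns the (match_id, start, end) tuples
    for doc in matcher.pipe(nlp.pipe(texts), batch_size=1000):
        print(matcher(doc))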

View File

@@ -166,14 +166,12 @@ cdef class PhraseMatcher:
                 on_match(self, doc, i, matches)
         return matches

-    def pipe(self, stream, batch_size=1000, n_threads=1, return_matches=False,
+    def pipe(self, stream, batch_size=1000, n_threads=-1, return_matches=False,
              as_tuples=False):
         """Match a stream of documents, yielding them in turn.

         docs (iterable): A stream of documents.
         batch_size (int): Number of documents to accumulate into a working set.
-        n_threads (int): The number of threads with which to work on the buffer
-            in parallel, if the implementation supports multi-threading.
         return_matches (bool): Yield the match lists along with the docs, making
             results (doc, matches) tuples.
         as_tuples (bool): Interpret the input stream as (doc, context) tuples,
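PhraseMatcher.pipe gets the same -1 default; return_matches and as_tuples are untouched. A sketch using the v2-style add() with example docs as patterns (the city names are illustrative):

    from spacy.matcher import PhraseMatcher

    phrase_matcher = PhraseMatcher(nlp.vocab)
    phrase_matcher.add("GPE", None, nlp("New York"), nlp("San Francisco"))
    for doc, matches in phrase_matcher.pipe(nlp.pipe(texts), return_matches=True):
        print([doc[start:end].text for match_id, start, end in matches])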

View File

@@ -257,7 +257,6 @@ class Tensorizer(Pipe):

         stream (iterator): A sequence of `Doc` objects to process.
         batch_size (int): Number of `Doc` objects to group.
-        n_threads (int): Number of threads.
         YIELDS (iterator): A sequence of `Doc` objects, in order of input.
         """
         for docs in util.minibatch(stream, size=batch_size):
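The loop shown above batches via util.minibatch rather than any thread pool, which is why the n_threads line could go. A sketch of the helper in isolation (the stream is a stand-in):

    from spacy import util

    stream = (nlp(text) for text in texts)
    for docs in util.minibatch(stream, size=128):
        print(len(docs))  # lists of up to `size` docs, yielded in input order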

View File

@@ -205,13 +205,11 @@ cdef class Parser:
             self.set_annotations([doc], states, tensors=None)
         return doc

-    def pipe(self, docs, int batch_size=256, int n_threads=2, beam_width=None):
+    def pipe(self, docs, int batch_size=256, int n_threads=-1, beam_width=None):
         """Process a stream of documents.

         stream: The sequence of documents to process.
         batch_size (int): Number of documents to accumulate into a working set.
-        n_threads (int): The number of threads with which to work on the buffer
-            in parallel.
         YIELDS (Doc): Documents, in order.
         """
         if beam_width is None:
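Parser.pipe likewise keeps n_threads only for backwards compatibility, and per the trailing context, beam_width falls back to a configured value when left as None. A hypothetical call on an existing pipeline:

    parser = nlp.get_pipe("parser")
    for doc in parser.pipe(nlp.pipe(texts, disable=["parser"]), batch_size=256):
        print([(token.text, token.dep_) for token in doc])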

View File

@@ -125,7 +125,7 @@ cdef class Tokenizer:
         doc.c[doc.length - 1].spacy = string[-1] == " " and not in_ws
         return doc

-    def pipe(self, texts, batch_size=1000, n_threads=2):
+    def pipe(self, texts, batch_size=1000, n_threads=-1):
         """Tokenize a stream of texts.

         texts: A sequence of unicode texts.
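Tokenizer.pipe rounds out the set with the same -1 default. A sketch of streaming tokenization on its own (texts is a stand-in):

    for doc in nlp.tokenizer.pipe(texts, batch_size=1000):
        print([token.text for token in doc])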