mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-13 05:07:03 +03:00
Update multi_processing example
This commit is contained in:
parent
a137e8b418
commit
582be8746c
|
@@ -10,12 +10,13 @@ Compatible with: spaCy v2.0.0+
|
||||||
"""
|
"""
|
||||||
from __future__ import print_function, unicode_literals
|
from __future__ import print_function, unicode_literals
|
||||||
|
|
||||||
from toolz import partition_all
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from joblib import Parallel, delayed
|
from joblib import Parallel, delayed
|
||||||
|
from functools import partial
|
||||||
import thinc.extra.datasets
|
import thinc.extra.datasets
|
||||||
import plac
|
import plac
|
||||||
import spacy
|
import spacy
|
||||||
|
from spacy.util import minibatch
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
|
@plac.annotations(
|
||||||
|
@@ -35,10 +36,10 @@ def main(output_dir, model="en_core_web_sm", n_jobs=4, batch_size=1000, limit=10
|
||||||
data, _ = thinc.extra.datasets.imdb()
|
data, _ = thinc.extra.datasets.imdb()
|
||||||
texts, _ = zip(*data[-limit:])
|
texts, _ = zip(*data[-limit:])
|
||||||
print("Processing texts...")
|
print("Processing texts...")
|
||||||
partitions = partition_all(batch_size, texts)
|
partitions = minibatch(texts, size=batch_size)
|
||||||
executor = Parallel(n_jobs=n_jobs)
|
executor = Parallel(n_jobs=n_jobs, backend="multiprocessing", prefer="processes")
|
||||||
do = delayed(transform_texts)
|
do = delayed(partial(transform_texts, nlp))
|
||||||
tasks = (do(nlp, i, batch, output_dir) for i, batch in enumerate(partitions))
|
tasks = (do(i, batch, output_dir) for i, batch in enumerate(partitions))
|
||||||
executor(tasks)
|
executor(tasks)
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user