From 18eaa44835c7a096d3259404e92e946f3a8d505e Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sun, 7 Feb 2016 02:48:40 +0100
Subject: [PATCH 1/3] * Add parallel_parse example

---
Review notes (text in this section is ignored by git am):
 * main() now accepts batch_size, which was annotated and used but missing
   from the signature.
 * The output directory is created with os.makedirs(); path.join() only
   returned a string.
 * toolz.partition_all replaces toolz.partition so the final short chunk of
   comments is not dropped.
 * Blob ids on the "index" lines were not regenerated after these edits.

 examples/parallel_parse.py | 86 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 86 insertions(+)
 create mode 100644 examples/parallel_parse.py

diff --git a/examples/parallel_parse.py b/examples/parallel_parse.py
new file mode 100644
index 000000000..dc8deaf24
--- /dev/null
+++ b/examples/parallel_parse.py
@@ -0,0 +1,86 @@
+from __future__ import print_function, unicode_literals, division
+import io
+import bz2
+import logging
+import os
+from toolz import partition, partition_all
+from os import path
+import re
+
+import spacy.en
+from spacy.tokens import Doc
+
+from joblib import Parallel, delayed
+import plac
+import ujson
+
+
+def parallelize(func, iterator, n_jobs, extra, backend='multiprocessing'):
+    """Run func(*(item + extra)) for each tuple in iterator through joblib."""
+    extra = tuple(extra)
+    return Parallel(n_jobs=n_jobs, backend=backend)(delayed(func)(*(item + extra))
+                    for item in iterator)
+
+
+def iter_comments(loc):
+    """Yield the 'body' value of each JSON line in the bz2 file at loc."""
+    with bz2.BZ2File(loc) as file_:
+        for line in file_:
+            yield ujson.loads(line)['body']
+
+
+pre_format_re = re.compile(r'^[\`\*\~]')
+post_format_re = re.compile(r'[\`\*\~]$')
+url_re = re.compile(r'\[([^]]+)\]\(%%URL\)')
+link_re = re.compile(r'\[([^]]+)\]\(https?://[^\)]+\)')
+def strip_meta(text):
+    """Reduce markdown links to their label and trim edge formatting marks."""
+    text = link_re.sub(r'\1', text)
+    # Undo the HTML escaping of angle brackets.
+    text = text.replace('&gt;', '>').replace('&lt;', '<')
+    text = pre_format_re.sub('', text)
+    text = post_format_re.sub('', text)
+    return text.strip()
+
+
+def save_parses(batch_id, input_, out_dir, n_threads, batch_size):
+    """Run the pipeline over input_, writing docs to <out_dir>/<batch_id>.bin.
+
+    Returns immediately, without processing, if that file already exists.
+    """
+    out_loc = path.join(out_dir, '%d.bin' % batch_id)
+    if path.exists(out_loc):
+        return None
+    print('Batch', batch_id)
+    nlp = spacy.en.English(parser=False)
+    nlp.matcher = None
+    with open(out_loc, 'wb') as file_:
+        texts = (strip_meta(text) for text in input_)
+        texts = (text for text in texts if text.strip())
+        for doc in nlp.pipe(texts, batch_size=batch_size, n_threads=n_threads):
+            file_.write(doc.to_bytes())
+
+@plac.annotations(
+    in_loc=("Location of input file"),
+    out_dir=("Location of output directory"),
+    n_process=("Number of processes", "option", "p", int),
+    n_thread=("Number of threads per process", "option", "t", int),
+    batch_size=("Number of texts to accumulate in a buffer", "option", "b", int)
+)
+def main(in_loc, out_dir, n_process=1, n_thread=4, batch_size=100):
+    """Process the comments in in_loc and save the results under out_dir."""
+    if not path.exists(out_dir):
+        # path.join() only builds a string; the directory has to be created.
+        os.makedirs(out_dir)
+    if n_process >= 2:
+        # partition_all, unlike partition, keeps the final short chunk.
+        texts = partition_all(200000, iter_comments(in_loc))
+        parallelize(save_parses, enumerate(texts), n_process, [out_dir, n_thread, batch_size],
+                    backend='multiprocessing')
+    else:
+        save_parses(0, iter_comments(in_loc), out_dir, n_thread, batch_size)
+
+
+
+if __name__ == '__main__':
+    plac.call(main)

From dc61056183bb84ca90b5ffb6a07f92d8bfaad394 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sun, 7 Feb 2016 02:56:16 +0100
Subject: [PATCH 2/3] * Fix parallel_parse script

---
 examples/parallel_parse.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/parallel_parse.py b/examples/parallel_parse.py
index dc8deaf24..45e14d845 100644
--- a/examples/parallel_parse.py
+++ b/examples/parallel_parse.py
@@ -52,7 +52,7 @@ def save_parses(batch_id, input_, out_dir, n_threads, batch_size):
     if path.exists(out_loc):
         return None
     print('Batch', batch_id)
-    nlp = spacy.en.English(parser=False)
+    nlp = spacy.en.English()
     nlp.matcher = None
     with open(out_loc, 'wb') as file_:
         texts = (strip_meta(text) for text in input_)

From 5d96b3ef4f272df1d976ae6366e8c7a1b547aca7 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sun, 7 Feb 2016 13:48:58 +0100
Subject: [PATCH 3/3] * Increment version

---
 spacy/about.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/spacy/about.py b/spacy/about.py
index 96f67e45c..d01cf8f69 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -4,11 +4,11 @@
 # https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py
 
 __name__ = 'spacy'
-__version__ = '0.100.3'
+__version__ = '0.100.5'
 __summary__ = 'Industrial-strength NLP'
 __uri__ = 'https://spacy.io'
 __author__ = 'Matthew Honnibal'
 __email__ = 'matt@spacy.io'
 __license__ = 'MIT'
-__release__ = False
+__release__ = True
 __default_model__ = 'en_default'