From 18eaa44835c7a096d3259404e92e946f3a8d505e Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal@gmail.com>
Date: Sun, 7 Feb 2016 02:48:40 +0100
Subject: [PATCH 1/3] * Add parallel_parse example

---
 examples/parallel_parse.py | 74 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 74 insertions(+)
 create mode 100644 examples/parallel_parse.py

diff --git a/examples/parallel_parse.py b/examples/parallel_parse.py
new file mode 100644
index 000000000..dc8deaf24
--- /dev/null
+++ b/examples/parallel_parse.py
@@ -0,0 +1,74 @@
+from __future__ import print_function, unicode_literals, division
+import io
+import bz2
+import logging
+from toolz import partition
+from os import path
+import re
+
+import spacy.en
+from spacy.tokens import Doc
+
+from joblib import Parallel, delayed
+import plac
+import ujson
+
+
+def parallelize(func, iterator, n_jobs, extra, backend='multiprocessing'):
+    extra = tuple(extra)
+    return Parallel(n_jobs=n_jobs, backend=backend)(delayed(func)(*(item + extra))
+                    for item in iterator)
+
+
+def iter_comments(loc):
+    with bz2.BZ2File(loc) as file_:
+        for i, line in enumerate(file_):
+            yield ujson.loads(line)['body']
+
+
+pre_format_re = re.compile(r'^[\`\*\~]')
+post_format_re = re.compile(r'[\`\*\~]$')
+url_re = re.compile(r'\[([^]]+)\]\(%%URL\)')
+link_re = re.compile(r'\[([^]]+)\]\(https?://[^\)]+\)')
+def strip_meta(text):
+    text = link_re.sub(r'\1', text)
+    text = text.replace('&gt;', '>').replace('&lt;', '<')
+    text = pre_format_re.sub('', text)
+    text = post_format_re.sub('', text)
+    return text.strip()
+
+
+def save_parses(batch_id, input_, out_dir, n_threads, batch_size):
+    out_loc = path.join(out_dir, '%d.bin' % batch_id)
+    if path.exists(out_loc):
+        return None
+    print('Batch', batch_id)
+    nlp = spacy.en.English(parser=False)
+    nlp.matcher = None
+    with open(out_loc, 'wb') as file_:
+        texts = (strip_meta(text) for text in input_)
+        texts = (text for text in texts if text.strip())
+        for doc in nlp.pipe(texts, batch_size=batch_size, n_threads=n_threads):
+            file_.write(doc.to_bytes())
+
+@plac.annotations(
+    in_loc=("Location of input file"),
+    out_dir=("Location of input file"),
+    n_process=("Number of processes", "option", "p", int),
+    n_thread=("Number of threads per process", "option", "t", int),
+    batch_size=("Number of texts to accumulate in a buffer", "option", "b", int)
+)
+def main(in_loc, out_dir, n_process=1, n_thread=4):
+    if not path.exists(out_dir):
+        path.join(out_dir)
+    if n_process >= 2:
+        texts = partition(200000, iter_comments(in_loc))
+        parallelize(save_parses, enumerate(texts), n_process, [out_dir, n_thread, batch_size],
+                   backend='multiprocessing')
+    else:
+        save_parses(0, iter_comments(in_loc), out_dir, n_thread, batch_size)
+
+
+
+if __name__ == '__main__':
+    plac.call(main)

From dc61056183bb84ca90b5ffb6a07f92d8bfaad394 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal@gmail.com>
Date: Sun, 7 Feb 2016 02:56:16 +0100
Subject: [PATCH 2/3] * Fix parallel_parse script

---
 examples/parallel_parse.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/parallel_parse.py b/examples/parallel_parse.py
index dc8deaf24..45e14d845 100644
--- a/examples/parallel_parse.py
+++ b/examples/parallel_parse.py
@@ -43,7 +43,7 @@ def save_parses(batch_id, input_, out_dir, n_threads, batch_size):
     if path.exists(out_loc):
         return None
     print('Batch', batch_id)
-    nlp = spacy.en.English(parser=False)
+    nlp = spacy.en.English()
     nlp.matcher = None
     with open(out_loc, 'wb') as file_:
         texts = (strip_meta(text) for text in input_)

From 5d96b3ef4f272df1d976ae6366e8c7a1b547aca7 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Sun, 7 Feb 2016 13:48:58 +0100
Subject: [PATCH 3/3] * Increment version

---
 spacy/about.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/spacy/about.py b/spacy/about.py
index 96f67e45c..d01cf8f69 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -4,11 +4,11 @@
 # https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py
 
 __name__ = 'spacy'
-__version__ = '0.100.3'
+__version__ = '0.100.5'
 __summary__ = 'Industrial-strength NLP'
 __uri__ = 'https://spacy.io'
 __author__ = 'Matthew Honnibal'
 __email__ = 'matt@spacy.io'
 __license__ = 'MIT'
-__release__ = False
+__release__ = True
 __default_model__ = 'en_default'