From 4e43c0ba93969a7629eaad428d93dba54d830bca Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Mon, 4 Nov 2019 20:29:03 +0100
Subject: [PATCH 1/5] Fix multiprocessing for as_tuples=True (#4582)

---
 spacy/language.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/spacy/language.py b/spacy/language.py
index d53710f58..97d6515c5 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -769,6 +769,7 @@ class Language(object):
                 texts,
                 batch_size=batch_size,
                 disable=disable,
+                n_process=n_process,
                 component_cfg=component_cfg,
             )
             for doc, context in izip(docs, contexts):

From 4ec76232880066e36ad9b613f934dd7dc66404ea Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Date: Mon, 4 Nov 2019 20:31:26 +0100
Subject: [PATCH 2/5] Fix conllu script (#4579)

* force extensions to avoid clash between example scripts

* fix arg order and default file encoding

* add example config for conllu script

* newline

* move extension definitions to main function

* a few more encoding fixes
---
 bin/ud/ud_train.py                   | 22 +++++++++-------------
 examples/training/conllu-config.json |  1 +
 examples/training/conllu.py          | 22 ++++++++++------------
 3 files changed, 20 insertions(+), 25 deletions(-)
 create mode 100644 examples/training/conllu-config.json

diff --git a/bin/ud/ud_train.py b/bin/ud/ud_train.py
index 945bf57eb..2784d7c3c 100644
--- a/bin/ud/ud_train.py
+++ b/bin/ud/ud_train.py
@@ -7,7 +7,6 @@ from __future__ import unicode_literals
 import plac
 from pathlib import Path
 import re
-import sys
 import json
 
 import spacy
@@ -19,12 +18,9 @@ from spacy.util import compounding, minibatch, minibatch_by_words
 from spacy.syntax.nonproj import projectivize
 from spacy.matcher import Matcher
 from spacy import displacy
-from collections import defaultdict, Counter
-from timeit import default_timer as timer
+from collections import defaultdict
 
-import itertools
 import random
-import numpy.random
 
 from spacy import lang
 from spacy.lang import zh
@@ -323,10 +319,6 @@ def get_token_conllu(token, i):
     return "\n".join(lines)
 
 
-Token.set_extension("get_conllu_lines", method=get_token_conllu, force=True)
-Token.set_extension("begins_fused", default=False, force=True)
-Token.set_extension("inside_fused", default=False, force=True)
-
 
 ##################
 # Initialization #
@@ -459,13 +451,13 @@ class TreebankPaths(object):
 
 @plac.annotations(
     ud_dir=("Path to Universal Dependencies corpus", "positional", None, Path),
+    parses_dir=("Directory to write the development parses", "positional", None, Path),
     corpus=(
-        "UD corpus to train and evaluate on, e.g. en, es_ancora, etc",
+        "UD corpus to train and evaluate on, e.g. UD_Spanish-AnCora",
         "positional",
         None,
         str,
     ),
-    parses_dir=("Directory to write the development parses", "positional", None, Path),
     config=("Path to json formatted config file", "option", "C", Path),
     limit=("Size limit", "option", "n", int),
     gpu_device=("Use GPU", "option", "g", int),
@@ -490,6 +482,10 @@ def main(
     # temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
     import tqdm
 
+    Token.set_extension("get_conllu_lines", method=get_token_conllu)
+    Token.set_extension("begins_fused", default=False)
+    Token.set_extension("inside_fused", default=False)
+
     spacy.util.fix_random_seed()
     lang.zh.Chinese.Defaults.use_jieba = False
     lang.ja.Japanese.Defaults.use_janome = False
@@ -506,8 +502,8 @@ def main(
 
     docs, golds = read_data(
         nlp,
-        paths.train.conllu.open(),
-        paths.train.text.open(),
+        paths.train.conllu.open(encoding="utf8"),
+        paths.train.text.open(encoding="utf8"),
         max_doc_length=config.max_doc_length,
         limit=limit,
     )
diff --git a/examples/training/conllu-config.json b/examples/training/conllu-config.json
new file mode 100644
index 000000000..9a11dd96b
--- /dev/null
+++ b/examples/training/conllu-config.json
@@ -0,0 +1 @@
+{"nr_epoch": 3, "batch_size": 24, "dropout":  0.001, "vectors":  0, "multitask_tag":  0, "multitask_sent":  0}
diff --git a/examples/training/conllu.py b/examples/training/conllu.py
index dfc790456..d9ee721ec 100644
--- a/examples/training/conllu.py
+++ b/examples/training/conllu.py
@@ -13,8 +13,7 @@ import spacy.util
 from spacy.tokens import Token, Doc
 from spacy.gold import GoldParse
 from spacy.syntax.nonproj import projectivize
-from collections import defaultdict, Counter
-from timeit import default_timer as timer
+from collections import defaultdict
 from spacy.matcher import Matcher
 
 import itertools
@@ -290,11 +289,6 @@ def get_token_conllu(token, i):
     return "\n".join(lines)
 
 
-Token.set_extension("get_conllu_lines", method=get_token_conllu)
-Token.set_extension("begins_fused", default=False)
-Token.set_extension("inside_fused", default=False)
-
-
 ##################
 # Initialization #
 ##################
@@ -381,20 +375,24 @@ class TreebankPaths(object):
 
 @plac.annotations(
     ud_dir=("Path to Universal Dependencies corpus", "positional", None, Path),
+    parses_dir=("Directory to write the development parses", "positional", None, Path),
+    config=("Path to json formatted config file", "positional", None, Config.load),
     corpus=(
-        "UD corpus to train and evaluate on, e.g. en, es_ancora, etc",
+        "UD corpus to train and evaluate on, e.g. UD_Spanish-AnCora",
         "positional",
         None,
         str,
     ),
-    parses_dir=("Directory to write the development parses", "positional", None, Path),
-    config=("Path to json formatted config file", "positional", None, Config.load),
     limit=("Size limit", "option", "n", int),
 )
 def main(ud_dir, parses_dir, config, corpus, limit=0):
     # temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
     import tqdm
 
+    Token.set_extension("get_conllu_lines", method=get_token_conllu)
+    Token.set_extension("begins_fused", default=False)
+    Token.set_extension("inside_fused", default=False)
+
     paths = TreebankPaths(ud_dir, corpus)
     if not (parses_dir / corpus).exists():
         (parses_dir / corpus).mkdir()
@@ -403,8 +401,8 @@ def main(ud_dir, parses_dir, config, corpus, limit=0):
 
     docs, golds = read_data(
         nlp,
-        paths.train.conllu.open(),
-        paths.train.text.open(),
+        paths.train.conllu.open(encoding="utf8"),
+        paths.train.text.open(encoding="utf8"),
         max_doc_length=config.max_doc_length,
         limit=limit,
     )

From 83381018d3b165008cf9678117a77ef40c66ce18 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Tue, 5 Nov 2019 11:52:43 +0100
Subject: [PATCH 3/5] Add load_from_docbin example [ci skip]

TODO: upload the file somewhere
---
 examples/load_from_docbin.py | 45 ++++++++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)
 create mode 100644 examples/load_from_docbin.py

diff --git a/examples/load_from_docbin.py b/examples/load_from_docbin.py
new file mode 100644
index 000000000..f26e7fc49
--- /dev/null
+++ b/examples/load_from_docbin.py
@@ -0,0 +1,45 @@
+# coding: utf-8
+"""
+Example of loading previously parsed text using spaCy's DocBin class. The example
+performs an entity count to show that the annotations are available.
+For more details, see https://spacy.io/usage/saving-loading#docs
+Installation:
+python -m spacy download en_core_web_lg
+Usage:
+python examples/load_from_docbin.py en_core_web_lg RC_2015-03-9.spacy
+"""
+from __future__ import unicode_literals
+
+import spacy
+from spacy.tokens import DocBin
+from timeit import default_timer as timer
+from collections import Counter
+
+EXAMPLE_PARSES_PATH = "RC_2015-03-9.spacy"
+
+
+def main(model="en_core_web_lg", docbin_path=EXAMPLE_PARSES_PATH):
+    nlp = spacy.load(model)
+    print("Reading data from {}".format(docbin_path))
+    with open(docbin_path, "rb") as file_:
+        bytes_data = file_.read()
+    nr_word = 0
+    start_time = timer()
+    entities = Counter()
+    docbin = DocBin().from_bytes(bytes_data)
+    for doc in docbin.get_docs(nlp.vocab):
+        nr_word += len(doc)
+        entities.update((e.label_, e.text) for e in doc.ents)
+    end_time = timer()
+    msg = "Loaded {nr_word} words in {seconds} seconds ({wps} words per second)"
+    wps = nr_word / (end_time - start_time)
+    print(msg.format(nr_word=nr_word, seconds=end_time - start_time, wps=wps))
+    print("Most common entities:")
+    for (label, entity), freq in entities.most_common(30):
+        print(freq, entity, label)
+
+
+if __name__ == "__main__":
+    import plac
+
+    plac.call(main)

From fed53b1552b935c5beb8500d89a4d411b210d5bc Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Tue, 5 Nov 2019 18:26:47 +0100
Subject: [PATCH 4/5] Update README.md

---
 README.md | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/README.md b/README.md
index 99d66bb31..529fa419a 100644
--- a/README.md
+++ b/README.md
@@ -180,9 +180,6 @@ pointing pip to a path or URL.
 # download best-matching version of specific model for your spaCy installation
 python -m spacy download en_core_web_sm
 
-# out-of-the-box: download best-matching default model
-python -m spacy download en
-
 # pip install .tar.gz archive from path or URL
 pip install /Users/you/en_core_web_sm-2.2.0.tar.gz
 pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz

From 828ef27a3206a969a56a6dbe47fa38d6e9a1a621 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Tue, 5 Nov 2019 18:30:11 +0100
Subject: [PATCH 5/5] Add warnings about 3.8 (resolves #4593) [ci skip]

---
 README.md                   |  7 +++++++
 website/docs/usage/index.md | 11 +++++++++++
 2 files changed, 18 insertions(+)

diff --git a/README.md b/README.md
index 529fa419a..980fc5b0b 100644
--- a/README.md
+++ b/README.md
@@ -104,6 +104,13 @@ For detailed installation instructions, see the
 [pip]: https://pypi.org/project/spacy/
 [conda]: https://anaconda.org/conda-forge/spacy
 
+> ⚠️ **Important note for Python 3.8:** We can't yet ship pre-compiled binary
+> wheels for spaCy that work on Python 3.8, as we're still waiting for our CI
+> providers and other tooling to support it. This means that in order to run
+> spaCy on Python 3.8, you'll need to have [a compiler installed](#source) and compile
+> the library and its Cython dependencies locally. If this is causing problems
+> for you, the easiest solution is to **use Python 3.7** in the meantime.
+
 ### pip
 
 Using pip, spaCy releases are available as source packages and binary wheels (as
diff --git a/website/docs/usage/index.md b/website/docs/usage/index.md
index 441297813..2b0045bc3 100644
--- a/website/docs/usage/index.md
+++ b/website/docs/usage/index.md
@@ -20,6 +20,17 @@ available over [pip](https://pypi.python.org/pypi/spacy) and
 > possible, the new docs also include notes on features that have changed in
 > v2.0, and features that were introduced in the new version.
 
+<Infobox variant="warning" title="Important note for Python 3.8">
+
+We can't yet ship pre-compiled binary wheels for spaCy that work on Python 3.8,
+as we're still waiting for our CI providers and other tooling to support it.
+This means that in order to run spaCy on Python 3.8, you'll need to have
+[a compiler installed](#source) and compile the library and its Cython
+dependencies locally. If this is causing problems for you, the easiest solution
+is to **use Python 3.7** in the meantime.
+
+</Infobox>
+
 ## Quickstart {hidden="true"}
 
 import QuickstartInstall from 'widgets/quickstart-install.js'