From ae2c208735a9ee033f0eaaae9ea6fa203f30ef68 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Thu, 20 Jun 2019 10:36:38 +0200
Subject: [PATCH 1/5] Auto-format [ci skip]

---
 spacy/cli/pretrain.py | 30 +++++++++++++++++++++++++-----
 1 file changed, 25 insertions(+), 5 deletions(-)

diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py
index 7afd10520..2fe5b247a 100644
--- a/spacy/cli/pretrain.py
+++ b/spacy/cli/pretrain.py
@@ -23,19 +23,39 @@ from .train import _load_pretrained_tok2vec
 
 
 @plac.annotations(
-    texts_loc=("Path to JSONL file with raw texts to learn from, with text provided as the key 'text' or tokens as the "
-               "key 'tokens'", "positional", None, str),
+    texts_loc=(
+        "Path to JSONL file with raw texts to learn from, with text provided as the key 'text' or tokens as the "
+        "key 'tokens'",
+        "positional",
+        None,
+        str,
+    ),
     vectors_model=("Name or path to spaCy model with vectors to learn from"),
     output_dir=("Directory to write models to on each epoch", "positional", None, str),
     width=("Width of CNN layers", "option", "cw", int),
     depth=("Depth of CNN layers", "option", "cd", int),
     embed_rows=("Number of embedding rows", "option", "er", int),
-    loss_func=("Loss function to use for the objective. Either 'L2' or 'cosine'", "option", "L", str),
+    loss_func=(
+        "Loss function to use for the objective. Either 'L2' or 'cosine'",
+        "option",
+        "L",
+        str,
+    ),
     use_vectors=("Whether to use the static vectors as input features", "flag", "uv"),
     dropout=("Dropout rate", "option", "d", float),
     batch_size=("Number of words per training batch", "option", "bs", int),
-    max_length=("Max words per example. Longer examples are discarded", "option", "xw", int),
-    min_length=("Min words per example. Shorter examples are discarded", "option", "nw", int),
+    max_length=(
+        "Max words per example. Longer examples are discarded",
+        "option",
+        "xw",
+        int,
+    ),
+    min_length=(
+        "Min words per example. Shorter examples are discarded",
+        "option",
+        "nw",
+        int,
+    ),
     seed=("Seed for random number generators", "option", "s", int),
     n_iter=("Number of iterations to pretrain", "option", "i", int),
     n_save_every=("Save model every X batches.", "option", "se", int),

From c833d9b31487fec9a5edb9c4a7566f8b5de17213 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Thu, 20 Jun 2019 17:48:45 +0200
Subject: [PATCH 2/5] Add "v.s." to English tokenizer exceptions (see #3868)

---
 spacy/lang/en/tokenizer_exceptions.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/spacy/lang/en/tokenizer_exceptions.py b/spacy/lang/en/tokenizer_exceptions.py
index 5063319a6..9731dc752 100644
--- a/spacy/lang/en/tokenizer_exceptions.py
+++ b/spacy/lang/en/tokenizer_exceptions.py
@@ -537,6 +537,7 @@ for orth in [
     "Sen.",
     "St.",
     "vs.",
+    "v.s."
 ]:
     _exc[orth] = [{ORTH: orth}]
 
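The reformatted @plac.annotations tuples in patch 1 keep the same (description, kind, abbreviation, type) layout, so the `spacy pretrain` CLI surface is unchanged — e.g. `-L cosine` still selects the loss function. The exception added in patch 2 can be spot-checked with a blank pipeline; a minimal sketch, assuming a spaCy build that includes this patch (the sample sentence is made up):

```python
# Sketch: with the new exception, "v.s." should survive tokenization as a
# single token, just like the existing "vs." entry. No statistical model is
# needed -- tokenizer exceptions ship with the language data itself.
from spacy.lang.en import English

nlp = English()
doc = nlp(u"It was England v.s. Germany.")
assert [t.text for t in doc] == ["It", "was", "England", "v.s.", "Germany", "."]
```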
From f22704621ef5d136e00a47068288bf55f666716d Mon Sep 17 00:00:00 2001
From: Bram Vanroy
Date: Mon, 24 Jun 2019 11:03:16 +0200
Subject: [PATCH 3/5] Update CITATION (#3873)

As discussed in https://github.com/explosion/spaCy/pull/2167, the citation
should look slightly different.
---
 CITATION | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/CITATION b/CITATION
index 301224955..e820c197d 100644
--- a/CITATION
+++ b/CITATION
@@ -1,6 +1,6 @@
-@ARTICLE{spacy2,
-  AUTHOR = {Honnibal, Matthew AND Montani, Ines},
-  TITLE = {spaCy 2: Natural language understanding with Bloom embeddings, convolutional neural networks and incremental parsing},
-  YEAR = {2017},
-  JOURNAL = {To appear}
+@unpublished{spacy2,
+  AUTHOR = {Honnibal, Matthew and Montani, Ines},
+  TITLE = {{spaCy 2}: Natural language understanding with {B}loom embeddings, convolutional neural networks and incremental parsing},
+  YEAR = {2017},
+  Note = {To appear}
 }

From 6ccdf37574ec4fdfc03a746e07dcd38a81b3900b Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Wed, 26 Jun 2019 14:37:05 +0200
Subject: [PATCH 4/5] Exclude user_data when copying doc in displaCy (closes #3882)

---
 spacy/displacy/__init__.py               |  2 +-
 spacy/tests/regression/test_issue3882.py | 15 +++++++++++++++
 2 files changed, 16 insertions(+), 1 deletion(-)
 create mode 100644 spacy/tests/regression/test_issue3882.py

diff --git a/spacy/displacy/__init__.py b/spacy/displacy/__init__.py
index b651c0996..d2ef21dbd 100644
--- a/spacy/displacy/__init__.py
+++ b/spacy/displacy/__init__.py
@@ -116,7 +116,7 @@ def parse_deps(orig_doc, options={}):
     doc (Doc): Document to parse.
     RETURNS (dict): Generated dependency parse keyed by words and arcs.
     """
-    doc = Doc(orig_doc.vocab).from_bytes(orig_doc.to_bytes())
+    doc = Doc(orig_doc.vocab).from_bytes(orig_doc.to_bytes(exclude=["user_data"]))
     if not doc.is_parsed:
         user_warning(Warnings.W005)
     if options.get("collapse_phrases", False):

diff --git a/spacy/tests/regression/test_issue3882.py b/spacy/tests/regression/test_issue3882.py
new file mode 100644
index 000000000..1b2dcea25
--- /dev/null
+++ b/spacy/tests/regression/test_issue3882.py
@@ -0,0 +1,15 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from spacy.displacy import parse_deps
+from spacy.tokens import Doc
+
+
+def test_issue3882(en_vocab):
+    """Test that displaCy doesn't serialize the doc.user_data when making a
+    copy of the Doc.
+    """
+    doc = Doc(en_vocab, words=["Hello", "world"])
+    doc.is_parsed = True
+    doc.user_data["test"] = set()
+    parse_deps(doc)

From d361e380b85ca3e907af9d9a8a720f6dc4bbcf27 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Wed, 26 Jun 2019 14:47:26 +0200
Subject: [PATCH 5/5] Fix matcher callback example (closes #3862)

---
 website/docs/usage/rule-based-matching.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/website/docs/usage/rule-based-matching.md b/website/docs/usage/rule-based-matching.md
index ee901e3fd..2354092f0 100644
--- a/website/docs/usage/rule-based-matching.md
+++ b/website/docs/usage/rule-based-matching.md
@@ -305,11 +305,11 @@ match on the uppercase versions, in case someone has written it as "Google
 i/o".
 ```python
 ### {executable="true"}
-import spacy
+from spacy.lang.en import English
 from spacy.matcher import Matcher
 from spacy.tokens import Span
 
-nlp = spacy.load("en_core_web_sm")
+nlp = English()
 matcher = Matcher(nlp.vocab)
 
 def add_event_ent(matcher, doc, i, matches):
@@ -322,7 +322,7 @@ def add_event_ent(matcher, doc, i, matches):
 pattern = [{"ORTH": "Google"}, {"ORTH": "I"}, {"ORTH": "/"}, {"ORTH": "O"}]
 matcher.add("GoogleIO", add_event_ent, pattern)
 
-doc = nlp(u"This is a text about Google I/O.")
+doc = nlp(u"This is a text about Google I/O")
 matches = matcher(doc)
 ```
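Patch 5 swaps the pretrained model for a blank `English()` class — with no statistical NER running, the entities written by the `add_event_ent` callback can't conflict with predicted ones. The serialization behaviour patch 4 relies on can also be checked outside the test suite; a minimal sketch of the round-trip, assuming spaCy 2.1.x with patch 4 applied (the words and the "test" key are made up):

```python
# Sketch: user_data can hold values msgpack can't serialize (e.g. a set), so
# parse_deps now copies the Doc with user_data excluded. Round-tripping the
# bytes shows the copy comes back without user_data but with tokens intact.
from spacy.tokens import Doc
from spacy.vocab import Vocab

vocab = Vocab()
doc = Doc(vocab, words=["Hello", "world"])
doc.user_data["test"] = set()  # would break a plain doc.to_bytes()

copy = Doc(vocab).from_bytes(doc.to_bytes(exclude=["user_data"]))
assert copy.user_data == {}
assert [t.text for t in copy] == ["Hello", "world"]
```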