From 66a4834e56389a915cf70d5089f0c2b756467b6a Mon Sep 17 00:00:00 2001 From: Karen Hambardzumyan Date: Mon, 22 Jun 2020 10:50:34 +0400 Subject: [PATCH 01/43] Some changes for Armenian (#5616) * Fixing numericals * We need a Armenian question sign to make the sentence a question --- spacy/lang/hy/examples.py | 2 +- spacy/lang/hy/lex_attrs.py | 17 +++++++++-------- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/spacy/lang/hy/examples.py b/spacy/lang/hy/examples.py index 323f77b1c..8a00fd243 100644 --- a/spacy/lang/hy/examples.py +++ b/spacy/lang/hy/examples.py @@ -11,6 +11,6 @@ Example sentences to test spaCy and its language models. sentences = [ "Լոնդոնը Միացյալ Թագավորության մեծ քաղաք է։", "Ո՞վ է Ֆրանսիայի նախագահը։", - "Որն է Միացյալ Նահանգների մայրաքաղաքը։", + "Ո՞րն է Միացյալ Նահանգների մայրաքաղաքը։", "Ե՞րբ է ծնվել Բարաք Օբաման։", ] diff --git a/spacy/lang/hy/lex_attrs.py b/spacy/lang/hy/lex_attrs.py index b556d679c..dea3c0e97 100644 --- a/spacy/lang/hy/lex_attrs.py +++ b/spacy/lang/hy/lex_attrs.py @@ -18,14 +18,15 @@ _num_words = [ "տասը", "տասնմեկ", "տասներկու", - "տասն­երեք", - "տասն­չորս", - "տասն­հինգ", - "տասն­վեց", - "տասն­յոթ", - "տասն­ութ", - "տասն­ինը", - "քսան" "երեսուն", + "տասներեք", + "տասնչորս", + "տասնհինգ", + "տասնվեց", + "տասնյոթ", + "տասնութ", + "տասնինը", + "քսան", + "երեսուն", "քառասուն", "հիսուն", "վաթսուն", From c34420794acd4e3b656332c430a41252d33a9722 Mon Sep 17 00:00:00 2001 From: Rameshh <30867740+rameshhpathak@users.noreply.github.com> Date: Mon, 22 Jun 2020 14:10:46 +0545 Subject: [PATCH 02/43] Add Nepali Language (#5622) * added support for nepali lang * added examples and test files * added spacy contributor agreement --- .github/contributors/rameshhpathak.md | 106 ++++++ spacy/lang/ne/__init__.py | 23 ++ spacy/lang/ne/examples.py | 22 ++ spacy/lang/ne/lex_attrs.py | 98 +++++ spacy/lang/ne/stop_words.py | 498 ++++++++++++++++++++++++++ spacy/tests/conftest.py | 5 + spacy/tests/lang/ne/__init__.py | 0 spacy/tests/lang/ne/test_text.py | 19 + 8 files changed, 771 insertions(+) create mode 100644 .github/contributors/rameshhpathak.md create mode 100644 spacy/lang/ne/__init__.py create mode 100644 spacy/lang/ne/examples.py create mode 100644 spacy/lang/ne/lex_attrs.py create mode 100644 spacy/lang/ne/stop_words.py create mode 100644 spacy/tests/lang/ne/__init__.py create mode 100644 spacy/tests/lang/ne/test_text.py diff --git a/.github/contributors/rameshhpathak.md b/.github/contributors/rameshhpathak.md new file mode 100644 index 000000000..30a543307 --- /dev/null +++ b/.github/contributors/rameshhpathak.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. 
+ +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. 
Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Ramesh Pathak | +| Company name (if applicable) | Diyo AI | +| Title or role (if applicable) | AI Engineer | +| Date | June 21, 2020 | +| GitHub username | rameshhpathak | +| Website (optional) |rameshhpathak.github.io| | diff --git a/spacy/lang/ne/__init__.py b/spacy/lang/ne/__init__.py new file mode 100644 index 000000000..21556277d --- /dev/null +++ b/spacy/lang/ne/__init__.py @@ -0,0 +1,23 @@ +# coding: utf8 +from __future__ import unicode_literals + +from .stop_words import STOP_WORDS +from .lex_attrs import LEX_ATTRS + +from ...language import Language +from ...attrs import LANG + + +class NepaliDefaults(Language.Defaults): + lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters.update(LEX_ATTRS) + lex_attr_getters[LANG] = lambda text: "ne" # Nepali language ISO code + stop_words = STOP_WORDS + + +class Nepali(Language): + lang = "ne" + Defaults = NepaliDefaults + + +__all__ = ["Nepali"] diff --git a/spacy/lang/ne/examples.py b/spacy/lang/ne/examples.py new file mode 100644 index 000000000..b3c4f9e73 --- /dev/null +++ b/spacy/lang/ne/examples.py @@ -0,0 +1,22 @@ +# coding: utf8 +from __future__ import unicode_literals + + +""" +Example sentences to test spaCy and its language models. + +>>> from spacy.lang.ne.examples import sentences +>>> docs = nlp.pipe(sentences) +""" + + +sentences = [ + "एप्पलले अमेरिकी स्टार्टअप १ अर्ब डलरमा किन्ने सोच्दै छ", + "स्वायत्त कारहरूले बीमा दायित्व निर्माताहरु तिर बदल्छन्", + "स्यान फ्रांसिस्कोले फुटपाथ वितरण रोबोटहरु प्रतिबंध गर्ने विचार गर्दै छ", + "लन्डन यूनाइटेड किंगडमको एक ठूलो शहर हो।", + "तिमी कहाँ छौ?", + "फ्रान्स को राष्ट्रपति को हो?", + "संयुक्त राज्यको राजधानी के हो?", + "बराक ओबामा कहिले कहिले जन्मेका हुन्?", +] diff --git a/spacy/lang/ne/lex_attrs.py b/spacy/lang/ne/lex_attrs.py new file mode 100644 index 000000000..652307577 --- /dev/null +++ b/spacy/lang/ne/lex_attrs.py @@ -0,0 +1,98 @@ +# coding: utf8 +from __future__ import unicode_literals + +from ..norm_exceptions import BASE_NORMS +from ...attrs import NORM, LIKE_NUM + + +# fmt: off +_stem_suffixes = [ + ["ा", "ि", "ी", "ु", "ू", "ृ", "े", "ै", "ो", "ौ"], + ["ँ", "ं", "्", "ः"], + ["लाई", "ले", "बाट", "को", "मा", "हरू"], + ["हरूलाई", "हरूले", "हरूबाट", "हरूको", "हरूमा"], + ["इलो", "िलो", "नु", "ाउनु", "ई", "इन", "इन्", "इनन्"], + ["एँ", "इँन्", "इस्", "इनस्", "यो", "एन", "यौं", "एनौं", "ए", "एनन्"], + ["छु", "छौँ", "छस्", "छौ", "छ", "छन्", "छेस्", "छे", "छ्यौ", "छिन्", "हुन्छ"], + ["दै", "दिन", "दिँन", "दैनस्", "दैन", "दैनौँ", "दैनौं", "दैनन्"], + ["हुन्न", "न्न", "न्न्स्", "न्नौं", "न्नौ", "न्न्न्", "िई"], + ["अ", "ओ", "ऊ", "अरी", "साथ", "वित्तिकै", "पूर्वक"], + ["याइ", "ाइ", "बार", "वार", "चाँहि"], + ["ने", "ेको", "ेकी", "ेका", "ेर", "दै", "तै", "िकन", "उ", "न", "नन्"] +] +# fmt: on + +# reference 1: https://en.wikipedia.org/wiki/Numbers_in_Nepali_language +# reference 2: https://www.imnepal.com/nepali-numbers/ +_num_words = [ + "शुन्य", + "एक", + "दुई", + "तीन", + "चार", + "पाँच", + "छ", + "सात", + "आठ", + "नौ", + "दश", + "एघार", + 
"बाह्र", + "तेह्र", + "चौध", + "पन्ध्र", + "सोह्र", + "सोह्र", + "सत्र", + "अठार", + "उन्नाइस", + "बीस", + "तीस", + "चालीस", + "पचास", + "साठी", + "सत्तरी", + "असी", + "नब्बे", + "सय", + "हजार", + "लाख", + "करोड", + "अर्ब", + "खर्ब", +] + + +def norm(string): + # normalise base exceptions, e.g. punctuation or currency symbols + if string in BASE_NORMS: + return BASE_NORMS[string] + # set stem word as norm, if available, adapted from: + # https://github.com/explosion/spaCy/blob/master/spacy/lang/hi/lex_attrs.py + # https://www.researchgate.net/publication/237261579_Structure_of_Nepali_Grammar + for suffix_group in reversed(_stem_suffixes): + length = len(suffix_group[0]) + if len(string) <= length: + break + for suffix in suffix_group: + if string.endswith(suffix): + return string[:-length] + return string + + +def like_num(text): + if text.startswith(("+", "-", "±", "~")): + text = text[1:] + text = text.replace(", ", "").replace(".", "") + if text.isdigit(): + return True + if text.count("/") == 1: + num, denom = text.split("/") + if num.isdigit() and denom.isdigit(): + return True + if text.lower() in _num_words: + return True + return False + + +LEX_ATTRS = {NORM: norm, LIKE_NUM: like_num} diff --git a/spacy/lang/ne/stop_words.py b/spacy/lang/ne/stop_words.py new file mode 100644 index 000000000..f008697d0 --- /dev/null +++ b/spacy/lang/ne/stop_words.py @@ -0,0 +1,498 @@ +# coding: utf8 +from __future__ import unicode_literals + + +# Source: https://github.com/sanjaalcorps/NepaliStopWords/blob/master/NepaliStopWords.txt + +STOP_WORDS = set( + """ +अक्सर +अगाडि +अगाडी +अघि +अझै +अठार +अथवा +अनि +अनुसार +अन्तर्गत +अन्य +अन्यत्र +अन्यथा +अब +अरु +अरुलाई +अरू +अर्को +अर्थात +अर्थात् +अलग +अलि +अवस्था +अहिले +आए +आएका +आएको +आज +आजको +आठ +आत्म +आदि +आदिलाई +आफनो +आफू +आफूलाई +आफै +आफैँ +आफ्नै +आफ्नो +आयो +उ +उक्त +उदाहरण +उनको +उनलाई +उनले +उनि +उनी +उनीहरुको +उन्नाइस +उप +उसको +उसलाई +उसले +उहालाई +ऊ +एउटा +एउटै +एक +एकदम +एघार +ओठ +औ +औं +कता +कति +कतै +कम +कमसेकम +कसरि +कसरी +कसै +कसैको +कसैलाई +कसैले +कसैसँग +कस्तो +कहाँबाट +कहिलेकाहीं +का +काम +कारण +कि +किन +किनभने +कुन +कुनै +कुन्नी +कुरा +कृपया +के +केहि +केही +को +कोहि +कोहिपनि +कोही +कोहीपनि +क्रमशः +गए +गएको +गएर +गयौ +गरि +गरी +गरे +गरेका +गरेको +गरेर +गरौं +गर्छ +गर्छन् +गर्छु +गर्दा +गर्दै +गर्न +गर्नु +गर्नुपर्छ +गर्ने +गैर +घर +चार +चाले +चाहनुहुन्छ +चाहन्छु +चाहिं +चाहिए +चाहिंले +चाहीं +चाहेको +चाहेर +चोटी +चौथो +चौध +छ +छन +छन् +छु +छू +छैन +छैनन् +छौ +छौं +जता +जताततै +जना +जनाको +जनालाई +जनाले +जब +जबकि +जबकी +जसको +जसबाट +जसमा +जसरी +जसलाई +जसले +जस्ता +जस्तै +जस्तो +जस्तोसुकै +जहाँ +जान +जाने +जाहिर +जुन +जुनै +जे +जो +जोपनि +जोपनी +झैं +ठाउँमा +ठीक +ठूलो +त +तता +तत्काल +तथा +तथापि +तथापी +तदनुसार +तपाइ +तपाई +तपाईको +तब +तर +तर्फ +तल +तसरी +तापनि +तापनी +तिन +तिनि +तिनिहरुलाई +तिनी +तिनीहरु +तिनीहरुको +तिनीहरू +तिनीहरूको +तिनै +तिमी +तिर +तिरको +ती +तीन +तुरन्त +तुरुन्त +तुरुन्तै +तेश्रो +तेस्कारण +तेस्रो +तेह्र +तैपनि +तैपनी +त्यत्तिकै +त्यत्तिकैमा +त्यस +त्यसकारण +त्यसको +त्यसले +त्यसैले +त्यसो +त्यस्तै +त्यस्तो +त्यहाँ +त्यहिँ +त्यही +त्यहीँ +त्यहीं +त्यो +त्सपछि +त्सैले +थप +थरि +थरी +थाहा +थिए +थिएँ +थिएन +थियो +दर्ता +दश +दिए +दिएको +दिन +दिनुभएको +दिनुहुन्छ +दुइ +दुइवटा +दुई +देखि +देखिन्छ +देखियो +देखे +देखेको +देखेर +दोश्री +दोश्रो +दोस्रो +द्वारा +धन्न +धेरै +धौ +न +नगर्नु +नगर्नू +नजिकै +नत्र +नत्रभने +नभई +नभएको +नभनेर +नयाँ +नि +निकै +निम्ति +निम्न +निम्नानुसार +निर्दिष्ट +नै +नौ +पक्का +पक्कै +पछाडि +पछाडी +पछि +पछिल्लो +पछी +पटक +पनि +पन्ध्र +पर्छ +पर्थ्यो +पर्दैन +पर्ने +पर्नेमा +पर्याप्त +पहिले 
+पहिलो +पहिल्यै +पाँच +पांच +पाचौँ +पाँचौं +पिच्छे +पूर्व +पो +प्रति +प्रतेक +प्रत्यक +प्राय +प्लस +फरक +फेरि +फेरी +बढी +बताए +बने +बरु +बाट +बारे +बाहिर +बाहेक +बाह्र +बिच +बिचमा +बिरुद्ध +बिशेष +बिस +बीच +बीचमा +बीस +भए +भएँ +भएका +भएकालाई +भएको +भएन +भएर +भन +भने +भनेको +भनेर +भन् +भन्छन् +भन्छु +भन्दा +भन्दै +भन्नुभयो +भन्ने +भन्या +भयेन +भयो +भर +भरि +भरी +भा +भित्र +भित्री +भीत्र +म +मध्य +मध्ये +मलाई +मा +मात्र +मात्रै +माथि +माथी +मुख्य +मुनि +मुन्तिर +मेरो +मैले +यति +यथोचित +यदि +यद्ध्यपि +यद्यपि +यस +यसका +यसको +यसपछि +यसबाहेक +यसमा +यसरी +यसले +यसो +यस्तै +यस्तो +यहाँ +यहाँसम्म +यही +या +यी +यो +र +रही +रहेका +रहेको +रहेछ +राखे +राख्छ +राम्रो +रुपमा +रूप +रे +लगभग +लगायत +लाई +लाख +लागि +लागेको +ले +वटा +वरीपरी +वा +वाट +वापत +वास्तवमा +शायद +सक्छ +सक्ने +सँग +संग +सँगको +सँगसँगै +सँगै +संगै +सङ्ग +सङ्गको +सट्टा +सत्र +सधै +सबै +सबैको +सबैलाई +समय +समेत +सम्भव +सम्म +सय +सरह +सहित +सहितै +सही +साँच्चै +सात +साथ +साथै +सायद +सारा +सुनेको +सुनेर +सुरु +सुरुको +सुरुमै +सो +सोचेको +सोचेर +सोही +सोह्र +स्थित +स्पष्ट +हजार +हरे +हरेक +हामी +हामीले +हाम्रा +हाम्रो +हुँदैन +हुन +हुनत +हुनु +हुने +हुनेछ +हुन् +हुन्छ +हुन्थ्यो +हैन +हो +होइन +होकि +होला +""".split() +) diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 1f13da5d6..91b7e4d9d 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -170,6 +170,11 @@ def nb_tokenizer(): return get_lang_class("nb").Defaults.create_tokenizer() +@pytest.fixture(scope="session") +def ne_tokenizer(): + return get_lang_class("ne").Defaults.create_tokenizer() + + @pytest.fixture(scope="session") def nl_tokenizer(): return get_lang_class("nl").Defaults.create_tokenizer() diff --git a/spacy/tests/lang/ne/__init__.py b/spacy/tests/lang/ne/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/lang/ne/test_text.py b/spacy/tests/lang/ne/test_text.py new file mode 100644 index 000000000..926a7de04 --- /dev/null +++ b/spacy/tests/lang/ne/test_text.py @@ -0,0 +1,19 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import pytest + + +def test_ne_tokenizer_handlers_long_text(ne_tokenizer): + text = """मैले पाएको सर्टिफिकेटलाई म त बोक्रो सम्झन्छु र अभ्यास तब सुरु भयो, जब मैले कलेज पार गरेँ र जीवनको पढाइ सुरु गरेँ ।""" + tokens = ne_tokenizer(text) + assert len(tokens) == 24 + + +@pytest.mark.parametrize( + "text,length", + [("समय जान कति पनि बेर लाग्दैन ।", 7), ("म ठूलो हुँदै थिएँ ।", 5)], +) +def test_ne_tokenizer_handles_cnts(ne_tokenizer, text, length): + tokens = ne_tokenizer(text) + assert len(tokens) == length \ No newline at end of file From 150a39ccca2426fcd10638c8515d7ec98cb79d8f Mon Sep 17 00:00:00 2001 From: Hiroshi Matsuda <40782025+hiroshi-matsuda-rit@users.noreply.github.com> Date: Mon, 22 Jun 2020 21:32:25 +0900 Subject: [PATCH 03/43] Japanese model: add user_dict entries and small refactor (#5573) * user_dict fields: adding inflections, reading_forms, sub_tokens deleting: unidic_tags improve code readability around the token alignment procedure * add test cases, replace fugashi with sudachipy in conftest * move bunsetu.py to spaCy Universe as a pipeline component BunsetuRecognizer * tag is space -> both surface and tag are spaces * consider len(text)==0 --- spacy/lang/ja/__init__.py | 203 +++++++++++++------------- spacy/lang/ja/bunsetu.py | 144 ------------------ spacy/tests/lang/ja/test_tokenizer.py | 53 ++++++- 3 files changed, 152 insertions(+), 248 deletions(-) delete mode 100644 spacy/lang/ja/bunsetu.py diff --git a/spacy/lang/ja/__init__.py 
b/spacy/lang/ja/__init__.py index a7ad0846e..fb8b9d7fe 100644 --- a/spacy/lang/ja/__init__.py +++ b/spacy/lang/ja/__init__.py @@ -20,12 +20,7 @@ from ... import util # Hold the attributes we need with convenient names -DetailedToken = namedtuple("DetailedToken", ["surface", "pos", "lemma"]) - -# Handling for multiple spaces in a row is somewhat awkward, this simplifies -# the flow by creating a dummy with the same interface. -DummyNode = namedtuple("DummyNode", ["surface", "pos", "lemma"]) -DummySpace = DummyNode(" ", " ", " ") +DetailedToken = namedtuple("DetailedToken", ["surface", "tag", "inf", "lemma", "reading", "sub_tokens"]) def try_sudachi_import(split_mode="A"): @@ -53,7 +48,7 @@ def try_sudachi_import(split_mode="A"): ) -def resolve_pos(orth, pos, next_pos): +def resolve_pos(orth, tag, next_tag): """If necessary, add a field to the POS tag for UD mapping. Under Universal Dependencies, sometimes the same Unidic POS tag can be mapped differently depending on the literal token or its context @@ -64,124 +59,77 @@ def resolve_pos(orth, pos, next_pos): # Some tokens have their UD tag decided based on the POS of the following # token. - # orth based rules - if pos[0] in TAG_ORTH_MAP: - orth_map = TAG_ORTH_MAP[pos[0]] + # apply orth based mapping + if tag in TAG_ORTH_MAP: + orth_map = TAG_ORTH_MAP[tag] if orth in orth_map: - return orth_map[orth], None + return orth_map[orth], None # current_pos, next_pos - # tag bi-gram mapping - if next_pos: - tag_bigram = pos[0], next_pos[0] + # apply tag bi-gram mapping + if next_tag: + tag_bigram = tag, next_tag if tag_bigram in TAG_BIGRAM_MAP: - bipos = TAG_BIGRAM_MAP[tag_bigram] - if bipos[0] is None: - return TAG_MAP[pos[0]][POS], bipos[1] + current_pos, next_pos = TAG_BIGRAM_MAP[tag_bigram] + if current_pos is None: # apply tag uni-gram mapping for current_pos + return TAG_MAP[tag][POS], next_pos # only next_pos is identified by tag bi-gram mapping else: - return bipos + return current_pos, next_pos - return TAG_MAP[pos[0]][POS], None + # apply tag uni-gram mapping + return TAG_MAP[tag][POS], None -# Use a mapping of paired punctuation to avoid splitting quoted sentences. -pairpunct = {'「':'」', '『': '』', '【': '】'} - - -def separate_sentences(doc): - """Given a doc, mark tokens that start sentences based on Unidic tags. - """ - - stack = [] # save paired punctuation - - for i, token in enumerate(doc[:-2]): - # Set all tokens after the first to false by default. This is necessary - # for the doc code to be aware we've done sentencization, see - # `is_sentenced`. - token.sent_start = (i == 0) - if token.tag_: - if token.tag_ == "補助記号-括弧開": - ts = str(token) - if ts in pairpunct: - stack.append(pairpunct[ts]) - elif stack and ts == stack[-1]: - stack.pop() - - if token.tag_ == "補助記号-句点": - next_token = doc[i+1] - if next_token.tag_ != token.tag_ and not stack: - next_token.sent_start = True - - -def get_dtokens(tokenizer, text): - tokens = tokenizer.tokenize(text) - words = [] - for ti, token in enumerate(tokens): - tag = '-'.join([xx for xx in token.part_of_speech()[:4] if xx != '*']) - inf = '-'.join([xx for xx in token.part_of_speech()[4:] if xx != '*']) - dtoken = DetailedToken( - token.surface(), - (tag, inf), - token.dictionary_form()) - if ti > 0 and words[-1].pos[0] == '空白' and tag == '空白': - # don't add multiple space tokens in a row - continue - words.append(dtoken) - - # remove empty tokens. These can be produced with characters like … that - # Sudachi normalizes internally. 
- words = [ww for ww in words if len(ww.surface) > 0] - return words - - -def get_words_lemmas_tags_spaces(dtokens, text, gap_tag=("空白", "")): +def get_dtokens_and_spaces(dtokens, text, gap_tag="空白"): + # Compare the content of tokens and text, first words = [x.surface for x in dtokens] if "".join("".join(words).split()) != "".join(text.split()): raise ValueError(Errors.E194.format(text=text, words=words)) - text_words = [] - text_lemmas = [] - text_tags = [] + + text_dtokens = [] text_spaces = [] text_pos = 0 # handle empty and whitespace-only texts if len(words) == 0: - return text_words, text_lemmas, text_tags, text_spaces + return text_dtokens, text_spaces elif len([word for word in words if not word.isspace()]) == 0: assert text.isspace() - text_words = [text] - text_lemmas = [text] - text_tags = [gap_tag] + text_dtokens = [DetailedToken(text, gap_tag, '', text, None, None)] text_spaces = [False] - return text_words, text_lemmas, text_tags, text_spaces - # normalize words to remove all whitespace tokens - norm_words, norm_dtokens = zip(*[(word, dtokens) for word, dtokens in zip(words, dtokens) if not word.isspace()]) - # align words with text - for word, dtoken in zip(norm_words, norm_dtokens): + return text_dtokens, text_spaces + + # align words and dtokens by referring text, and insert gap tokens for the space char spans + for word, dtoken in zip(words, dtokens): + # skip all space tokens + if word.isspace(): + continue try: word_start = text[text_pos:].index(word) except ValueError: raise ValueError(Errors.E194.format(text=text, words=words)) + + # space token if word_start > 0: w = text[text_pos:text_pos + word_start] - text_words.append(w) - text_lemmas.append(w) - text_tags.append(gap_tag) + text_dtokens.append(DetailedToken(w, gap_tag, '', w, None, None)) text_spaces.append(False) text_pos += word_start - text_words.append(word) - text_lemmas.append(dtoken.lemma) - text_tags.append(dtoken.pos) + + # content word + text_dtokens.append(dtoken) text_spaces.append(False) text_pos += len(word) + # poll a space char after the word if text_pos < len(text) and text[text_pos] == " ": text_spaces[-1] = True text_pos += 1 + + # trailing space token if text_pos < len(text): w = text[text_pos:] - text_words.append(w) - text_lemmas.append(w) - text_tags.append(gap_tag) + text_dtokens.append(DetailedToken(w, gap_tag, '', w, None, None)) text_spaces.append(False) - return text_words, text_lemmas, text_tags, text_spaces + + return text_dtokens, text_spaces class JapaneseTokenizer(DummyTokenizer): @@ -191,29 +139,78 @@ class JapaneseTokenizer(DummyTokenizer): self.tokenizer = try_sudachi_import(self.split_mode) def __call__(self, text): - dtokens = get_dtokens(self.tokenizer, text) + # convert sudachipy.morpheme.Morpheme to DetailedToken and merge continuous spaces + sudachipy_tokens = self.tokenizer.tokenize(text) + dtokens = self._get_dtokens(sudachipy_tokens) + dtokens, spaces = get_dtokens_and_spaces(dtokens, text) - words, lemmas, unidic_tags, spaces = get_words_lemmas_tags_spaces(dtokens, text) + # create Doc with tag bi-gram based part-of-speech identification rules + words, tags, inflections, lemmas, readings, sub_tokens_list = zip(*dtokens) if dtokens else [[]] * 6 + sub_tokens_list = list(sub_tokens_list) doc = Doc(self.vocab, words=words, spaces=spaces) - next_pos = None - for idx, (token, lemma, unidic_tag) in enumerate(zip(doc, lemmas, unidic_tags)): - token.tag_ = unidic_tag[0] - if next_pos: + next_pos = None # for bi-gram rules + for idx, (token, dtoken) in enumerate(zip(doc, 
dtokens)): + token.tag_ = dtoken.tag + if next_pos: # already identified in previous iteration token.pos = next_pos next_pos = None else: token.pos, next_pos = resolve_pos( token.orth_, - unidic_tag, - unidic_tags[idx + 1] if idx + 1 < len(unidic_tags) else None + dtoken.tag, + tags[idx + 1] if idx + 1 < len(tags) else None ) - # if there's no lemma info (it's an unk) just use the surface - token.lemma_ = lemma - doc.user_data["unidic_tags"] = unidic_tags + token.lemma_ = dtoken.lemma if dtoken.lemma else dtoken.surface + + doc.user_data["inflections"] = inflections + doc.user_data["reading_forms"] = readings + doc.user_data["sub_tokens"] = sub_tokens_list return doc + def _get_dtokens(self, sudachipy_tokens, need_sub_tokens=True): + sub_tokens_list = self._get_sub_tokens(sudachipy_tokens) if need_sub_tokens else None + dtokens = [ + DetailedToken( + token.surface(), # orth + '-'.join([xx for xx in token.part_of_speech()[:4] if xx != '*']), # tag + ','.join([xx for xx in token.part_of_speech()[4:] if xx != '*']), # inf + token.dictionary_form(), # lemma + token.reading_form(), # user_data['reading_forms'] + sub_tokens_list[idx] if sub_tokens_list else None, # user_data['sub_tokens'] + ) for idx, token in enumerate(sudachipy_tokens) if len(token.surface()) > 0 + # remove empty tokens which can be produced with characters like … that + ] + # Sudachi normalizes internally and outputs each space char as a token. + # This is the preparation for get_dtokens_and_spaces() to merge the continuous space tokens + return [ + t for idx, t in enumerate(dtokens) if + idx == 0 or + not t.surface.isspace() or t.tag != '空白' or + not dtokens[idx - 1].surface.isspace() or dtokens[idx - 1].tag != '空白' + ] + + def _get_sub_tokens(self, sudachipy_tokens): + if self.split_mode is None or self.split_mode == "A": # do nothing for default split mode + return None + + sub_tokens_list = [] # list of (list of list of DetailedToken | None) + for token in sudachipy_tokens: + sub_a = token.split(self.tokenizer.SplitMode.A) + if len(sub_a) == 1: # no sub tokens + sub_tokens_list.append(None) + elif self.split_mode == "B": + sub_tokens_list.append([self._get_dtokens(sub_a, False)]) + else: # "C" + sub_b = token.split(self.tokenizer.SplitMode.B) + if len(sub_a) == len(sub_b): + dtokens = self._get_dtokens(sub_a, False) + sub_tokens_list.append([dtokens, dtokens]) + else: + sub_tokens_list.append([self._get_dtokens(sub_a, False), self._get_dtokens(sub_b, False)]) + return sub_tokens_list + def _get_config(self): config = OrderedDict( ( diff --git a/spacy/lang/ja/bunsetu.py b/spacy/lang/ja/bunsetu.py deleted file mode 100644 index 7c3eee336..000000000 --- a/spacy/lang/ja/bunsetu.py +++ /dev/null @@ -1,144 +0,0 @@ -# coding: utf8 -from __future__ import unicode_literals - -from .stop_words import STOP_WORDS - - -POS_PHRASE_MAP = { - "NOUN": "NP", - "NUM": "NP", - "PRON": "NP", - "PROPN": "NP", - - "VERB": "VP", - - "ADJ": "ADJP", - - "ADV": "ADVP", - - "CCONJ": "CCONJP", -} - - -# return value: [(bunsetu_tokens, phrase_type={'NP', 'VP', 'ADJP', 'ADVP'}, phrase_tokens)] -def yield_bunsetu(doc, debug=False): - bunsetu = [] - bunsetu_may_end = False - phrase_type = None - phrase = None - prev = None - prev_tag = None - prev_dep = None - prev_head = None - for t in doc: - pos = t.pos_ - pos_type = POS_PHRASE_MAP.get(pos, None) - tag = t.tag_ - dep = t.dep_ - head = t.head.i - if debug: - print(t.i, t.orth_, pos, pos_type, dep, head, bunsetu_may_end, phrase_type, phrase, bunsetu) - - # DET is always an individual bunsetu - if pos == 
"DET": - if bunsetu: - yield bunsetu, phrase_type, phrase - yield [t], None, None - bunsetu = [] - bunsetu_may_end = False - phrase_type = None - phrase = None - - # PRON or Open PUNCT always splits bunsetu - elif tag == "補助記号-括弧開": - if bunsetu: - yield bunsetu, phrase_type, phrase - bunsetu = [t] - bunsetu_may_end = True - phrase_type = None - phrase = None - - # bunsetu head not appeared - elif phrase_type is None: - if bunsetu and prev_tag == "補助記号-読点": - yield bunsetu, phrase_type, phrase - bunsetu = [] - bunsetu_may_end = False - phrase_type = None - phrase = None - bunsetu.append(t) - if pos_type: # begin phrase - phrase = [t] - phrase_type = pos_type - if pos_type in {"ADVP", "CCONJP"}: - bunsetu_may_end = True - - # entering new bunsetu - elif pos_type and ( - pos_type != phrase_type or # different phrase type arises - bunsetu_may_end # same phrase type but bunsetu already ended - ): - # exceptional case: NOUN to VERB - if phrase_type == "NP" and pos_type == "VP" and prev_dep == 'compound' and prev_head == t.i: - bunsetu.append(t) - phrase_type = "VP" - phrase.append(t) - # exceptional case: VERB to NOUN - elif phrase_type == "VP" and pos_type == "NP" and ( - prev_dep == 'compound' and prev_head == t.i or - dep == 'compound' and prev == head or - prev_dep == 'nmod' and prev_head == t.i - ): - bunsetu.append(t) - phrase_type = "NP" - phrase.append(t) - else: - yield bunsetu, phrase_type, phrase - bunsetu = [t] - bunsetu_may_end = False - phrase_type = pos_type - phrase = [t] - - # NOUN bunsetu - elif phrase_type == "NP": - bunsetu.append(t) - if not bunsetu_may_end and (( - (pos_type == "NP" or pos == "SYM") and (prev_head == t.i or prev_head == head) and prev_dep in {'compound', 'nummod'} - ) or ( - pos == "PART" and (prev == head or prev_head == head) and dep == 'mark' - )): - phrase.append(t) - else: - bunsetu_may_end = True - - # VERB bunsetu - elif phrase_type == "VP": - bunsetu.append(t) - if not bunsetu_may_end and pos == "VERB" and prev_head == t.i and prev_dep == 'compound': - phrase.append(t) - else: - bunsetu_may_end = True - - # ADJ bunsetu - elif phrase_type == "ADJP" and tag != '連体詞': - bunsetu.append(t) - if not bunsetu_may_end and (( - pos == "NOUN" and (prev_head == t.i or prev_head == head) and prev_dep in {'amod', 'compound'} - ) or ( - pos == "PART" and (prev == head or prev_head == head) and dep == 'mark' - )): - phrase.append(t) - else: - bunsetu_may_end = True - - # other bunsetu - else: - bunsetu.append(t) - - prev = t.i - prev_tag = t.tag_ - prev_dep = t.dep_ - prev_head = head - - if bunsetu: - yield bunsetu, phrase_type, phrase diff --git a/spacy/tests/lang/ja/test_tokenizer.py b/spacy/tests/lang/ja/test_tokenizer.py index 26be5cf59..651e906eb 100644 --- a/spacy/tests/lang/ja/test_tokenizer.py +++ b/spacy/tests/lang/ja/test_tokenizer.py @@ -4,7 +4,7 @@ from __future__ import unicode_literals import pytest from ...tokenizer.test_naughty_strings import NAUGHTY_STRINGS -from spacy.lang.ja import Japanese +from spacy.lang.ja import Japanese, DetailedToken # fmt: off TOKENIZER_TESTS = [ @@ -96,6 +96,57 @@ def test_ja_tokenizer_split_modes(ja_tokenizer, text, len_a, len_b, len_c): assert len(nlp_c(text)) == len_c +@pytest.mark.parametrize("text,sub_tokens_list_a,sub_tokens_list_b,sub_tokens_list_c", + [ + ( + "選挙管理委員会", + [None, None, None, None], + [None, None, [ + [ + DetailedToken(surface='委員', tag='名詞-普通名詞-一般', inf='', lemma='委員', reading='イイン', sub_tokens=None), + DetailedToken(surface='会', tag='名詞-普通名詞-一般', inf='', lemma='会', reading='カイ', 
sub_tokens=None), + ] + ]], + [[ + [ + DetailedToken(surface='選挙', tag='名詞-普通名詞-サ変可能', inf='', lemma='選挙', reading='センキョ', sub_tokens=None), + DetailedToken(surface='管理', tag='名詞-普通名詞-サ変可能', inf='', lemma='管理', reading='カンリ', sub_tokens=None), + DetailedToken(surface='委員', tag='名詞-普通名詞-一般', inf='', lemma='委員', reading='イイン', sub_tokens=None), + DetailedToken(surface='会', tag='名詞-普通名詞-一般', inf='', lemma='会', reading='カイ', sub_tokens=None), + ], [ + DetailedToken(surface='選挙', tag='名詞-普通名詞-サ変可能', inf='', lemma='選挙', reading='センキョ', sub_tokens=None), + DetailedToken(surface='管理', tag='名詞-普通名詞-サ変可能', inf='', lemma='管理', reading='カンリ', sub_tokens=None), + DetailedToken(surface='委員会', tag='名詞-普通名詞-一般', inf='', lemma='委員会', reading='イインカイ', sub_tokens=None), + ] + ]] + ), + ] +) +def test_ja_tokenizer_sub_tokens(ja_tokenizer, text, sub_tokens_list_a, sub_tokens_list_b, sub_tokens_list_c): + nlp_a = Japanese(meta={"tokenizer": {"config": {"split_mode": "A"}}}) + nlp_b = Japanese(meta={"tokenizer": {"config": {"split_mode": "B"}}}) + nlp_c = Japanese(meta={"tokenizer": {"config": {"split_mode": "C"}}}) + + assert ja_tokenizer(text).user_data["sub_tokens"] == sub_tokens_list_a + assert nlp_a(text).user_data["sub_tokens"] == sub_tokens_list_a + assert nlp_b(text).user_data["sub_tokens"] == sub_tokens_list_b + assert nlp_c(text).user_data["sub_tokens"] == sub_tokens_list_c + + +@pytest.mark.parametrize("text,inflections,reading_forms", + [ + ( + "取ってつけた", + ("五段-ラ行,連用形-促音便", "", "下一段-カ行,連用形-一般", "助動詞-タ,終止形-一般"), + ("トッ", "テ", "ツケ", "タ"), + ), + ] +) +def test_ja_tokenizer_inflections_reading_forms(ja_tokenizer, text, inflections, reading_forms): + assert ja_tokenizer(text).user_data["inflections"] == inflections + assert ja_tokenizer(text).user_data["reading_forms"] == reading_forms + + def test_ja_tokenizer_emptyish_texts(ja_tokenizer): doc = ja_tokenizer("") assert len(doc) == 0 From bc1cb30b2157b2e3fe63ec42dad9650cb369e3c3 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 22 Jun 2020 14:37:24 +0200 Subject: [PATCH 04/43] Add warnings example in v2.3 migration guide (#5627) --- website/docs/usage/v2-3.md | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/website/docs/usage/v2-3.md b/website/docs/usage/v2-3.md index d59b50a6e..378b1ec34 100644 --- a/website/docs/usage/v2-3.md +++ b/website/docs/usage/v2-3.md @@ -161,10 +161,18 @@ debugging your tokenizer configuration. spaCy's custom warnings have been replaced with native Python [`warnings`](https://docs.python.org/3/library/warnings.html). Instead of -setting `SPACY_WARNING_IGNORE`, use the -[`warnings` filters](https://docs.python.org/3/library/warnings.html#the-warnings-filter) +setting `SPACY_WARNING_IGNORE`, use the [`warnings` +filters](https://docs.python.org/3/library/warnings.html#the-warnings-filter) to manage warnings. 
+```diff +import spacy ++ import warnings + +- spacy.errors.SPACY_WARNING_IGNORE.append('W007') ++ warnings.filterwarnings("ignore", message=r"\[W007\]", category=UserWarning) +``` + #### Normalization tables The normalization tables have moved from the language data in From 0ef78bad93a5ebaca783693cdf1e948e15cd7a86 Mon Sep 17 00:00:00 2001 From: Richard Liaw Date: Mon, 22 Jun 2020 23:53:58 -0700 Subject: [PATCH 05/43] contribute (#5632) --- .github/contributors/richardliaw.md | 106 ++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 .github/contributors/richardliaw.md diff --git a/.github/contributors/richardliaw.md b/.github/contributors/richardliaw.md new file mode 100644 index 000000000..2af4ce840 --- /dev/null +++ b/.github/contributors/richardliaw.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. 
With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Richard Liaw | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 06/22/2020 | +| GitHub username | richardliaw | +| Website (optional) | | \ No newline at end of file From d94e961f14af61dba4f01e0e2821217f38b85fbf Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 23 Jun 2020 13:29:51 +0200 Subject: [PATCH 06/43] Fix polarity of Token.is_oov and Lexeme.is_oov (#5634) Fix `Token.is_oov` and `Lexeme.is_oov` so they return `True` when the lexeme does **not** have a vector. 
--- spacy/lexeme.pyx | 2 +- spacy/tests/vocab_vectors/test_vectors.py | 6 +++--- spacy/tokens/token.pyx | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index 1df516dcb..8042098d7 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -349,7 +349,7 @@ cdef class Lexeme: @property def is_oov(self): """RETURNS (bool): Whether the lexeme is out-of-vocabulary.""" - return self.orth in self.vocab.vectors + return self.orth not in self.vocab.vectors property is_stop: """RETURNS (bool): Whether the lexeme is a stop word.""" diff --git a/spacy/tests/vocab_vectors/test_vectors.py b/spacy/tests/vocab_vectors/test_vectors.py index 576ca93d2..b31cef1f2 100644 --- a/spacy/tests/vocab_vectors/test_vectors.py +++ b/spacy/tests/vocab_vectors/test_vectors.py @@ -376,6 +376,6 @@ def test_vector_is_oov(): data[1] = 2.0 vocab.set_vector("cat", data[0]) vocab.set_vector("dog", data[1]) - assert vocab["cat"].is_oov is True - assert vocab["dog"].is_oov is True - assert vocab["hamster"].is_oov is False + assert vocab["cat"].is_oov is False + assert vocab["dog"].is_oov is False + assert vocab["hamster"].is_oov is True diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 45deebc93..8d3406bae 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -923,7 +923,7 @@ cdef class Token: @property def is_oov(self): """RETURNS (bool): Whether the token is out-of-vocabulary.""" - return self.c.lex.orth in self.vocab.vectors + return self.c.lex.orth not in self.vocab.vectors @property def is_stop(self): From 7ce451c211ef8c528a14ba4c5da3c380e534c350 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 23 Jun 2020 16:48:59 +0200 Subject: [PATCH 07/43] Extend what's new in v2.3 with vocab / is_oov (#5635) --- website/docs/usage/v2-3.md | 45 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/website/docs/usage/v2-3.md b/website/docs/usage/v2-3.md index 378b1ec34..c56b44267 100644 --- a/website/docs/usage/v2-3.md +++ b/website/docs/usage/v2-3.md @@ -182,6 +182,51 @@ If you're adding data for a new language, the normalization table should be added to `spacy-lookups-data`. See [adding norm exceptions](/usage/adding-languages#norm-exceptions). +#### No preloaded lexemes/vocab for models with vectors + +To reduce the initial loading time, the lexemes in `nlp.vocab` are no longer +loaded on initialization for models with vectors. As you process texts, the +lexemes will be added to the vocab automatically, just as in models without +vectors. + +To see the number of unique vectors and number of words with vectors, see +`nlp.meta['vectors']`, for example for `en_core_web_md` there are `20000` +unique vectors and `684830` words with vectors: + +```python +{ + 'width': 300, + 'vectors': 20000, + 'keys': 684830, + 'name': 'en_core_web_md.vectors' +} +``` + +If required, for instance if you are working directly with word vectors rather +than processing texts, you can load all lexemes for words with vectors at once: + +```python +for orth in nlp.vocab.vectors: + _ = nlp.vocab[orth] +``` + +#### Lexeme.is_oov and Token.is_oov + + + +Due to a bug, the values for `is_oov` are reversed in v2.3.0, but this will be +fixed in the next patch release v2.3.1. + + + +In v2.3, `Lexeme.is_oov` and `Token.is_oov` are `True` if the lexeme does not +have a word vector. This is equivalent to `token.orth not in +nlp.vocab.vectors`. + +Previously in v2.2, `is_oov` corresponded to whether a lexeme had stored +probability and cluster features. 
The probability and cluster features are no +longer included in the provided medium and large models (see the next section). + #### Probability and cluster features > #### Load and save extra prob lookups table From 6fe6e761de836550aa71105e1dfd75335612fa82 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 23 Jun 2020 23:21:11 +0200 Subject: [PATCH 08/43] Skip vocab in component config overrides (#5624) --- spacy/util.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/spacy/util.py b/spacy/util.py index 5362952e2..923f56b31 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -208,6 +208,10 @@ def load_model_from_path(model_path, meta=False, **overrides): pipeline = nlp.Defaults.pipe_names elif pipeline in (False, None): pipeline = [] + # skip "vocab" from overrides in component initialization since vocab is + # already configured from overrides when nlp is initialized above + if "vocab" in overrides: + del overrides["vocab"] for name in pipeline: if name not in disable: config = meta.get("pipeline_args", {}).get(name, {}) From fd4287c178feea0ab4a50e70c86f4583c9b886c6 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 24 Jun 2020 10:26:12 +0200 Subject: [PATCH 09/43] Fix backslashes in warnings config diff (#5640) Fix backslashes in warnings config diff in v2.3 migration section. --- website/docs/usage/v2-3.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/usage/v2-3.md b/website/docs/usage/v2-3.md index c56b44267..e6b88c779 100644 --- a/website/docs/usage/v2-3.md +++ b/website/docs/usage/v2-3.md @@ -170,7 +170,7 @@ import spacy + import warnings - spacy.errors.SPACY_WARNING_IGNORE.append('W007') -+ warnings.filterwarnings("ignore", message=r"\[W007\]", category=UserWarning) ++ warnings.filterwarnings("ignore", message=r"\\[W007\\]", category=UserWarning) ``` #### Normalization tables From b7107ac89feee7f1aa1381d3c2978d09919288c2 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 26 Jun 2020 09:23:21 +0200 Subject: [PATCH 10/43] Disregard special tag _SP in check for new tag map (#5641) * Skip special tag _SP in check for new tag map In `Tagger.begin_training()` check for new tags aside from `_SP` in the new tag map initialized from the provided gold tuples when determining whether to reinitialize the morphology with the new tag map. 
* Simplify _SP check --- spacy/pipeline/pipes.pyx | 4 ++-- spacy/tests/pipeline/test_tagger.py | 14 ++++++++++++++ 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 3f40cb545..8f07bf8f7 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -528,10 +528,10 @@ class Tagger(Pipe): new_tag_map[tag] = orig_tag_map[tag] else: new_tag_map[tag] = {POS: X} - if "_SP" in orig_tag_map: - new_tag_map["_SP"] = orig_tag_map["_SP"] cdef Vocab vocab = self.vocab if new_tag_map: + if "_SP" in orig_tag_map: + new_tag_map["_SP"] = orig_tag_map["_SP"] vocab.morphology = Morphology(vocab.strings, new_tag_map, vocab.morphology.lemmatizer, exc=vocab.morphology.exc) diff --git a/spacy/tests/pipeline/test_tagger.py b/spacy/tests/pipeline/test_tagger.py index a5bda9090..1681ffeaa 100644 --- a/spacy/tests/pipeline/test_tagger.py +++ b/spacy/tests/pipeline/test_tagger.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals import pytest from spacy.language import Language +from spacy.symbols import POS, NOUN def test_label_types(): @@ -11,3 +12,16 @@ def test_label_types(): nlp.get_pipe("tagger").add_label("A") with pytest.raises(ValueError): nlp.get_pipe("tagger").add_label(9) + + +def test_tagger_begin_training_tag_map(): + """Test that Tagger.begin_training() without gold tuples does not clobber + the tag map.""" + nlp = Language() + tagger = nlp.create_pipe("tagger") + orig_tag_count = len(tagger.labels) + tagger.add_label("A", {"POS": "NOUN"}) + nlp.add_pipe(tagger) + nlp.begin_training() + assert nlp.vocab.morphology.tag_map["A"] == {POS: NOUN} + assert orig_tag_count + 1 == len(nlp.get_pipe("tagger").labels) From 90c7eb0e2f51eb07582c4d9e3fcaed1fdb51c4bc Mon Sep 17 00:00:00 2001 From: PluieElectrique <41453973+PluieElectrique@users.noreply.github.com> Date: Fri, 26 Jun 2020 12:09:10 +0000 Subject: [PATCH 11/43] Reduce memory usage of Lookup's BloomFilter (#5606) * Reduce memory usage of Lookup's BloomFilter * Remove extra Table update --- .github/contributors/PluieElectrique.md | 106 ++++++++++++++++++++++++ spacy/lookups.py | 5 +- 2 files changed, 108 insertions(+), 3 deletions(-) create mode 100644 .github/contributors/PluieElectrique.md diff --git a/.github/contributors/PluieElectrique.md b/.github/contributors/PluieElectrique.md new file mode 100644 index 000000000..97e01650a --- /dev/null +++ b/.github/contributors/PluieElectrique.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. 
The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. 
Please do NOT +mark both statements: + + * [X] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Pluie | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2020-06-18 | +| GitHub username | PluieElectrique | +| Website (optional) | | diff --git a/spacy/lookups.py b/spacy/lookups.py index 1fa29bdfe..d4947be9f 100644 --- a/spacy/lookups.py +++ b/spacy/lookups.py @@ -120,8 +120,7 @@ class Lookups(object): """ self._tables = OrderedDict() for key, value in srsly.msgpack_loads(bytes_data).items(): - self._tables[key] = Table(key) - self._tables[key].update(value) + self._tables[key] = Table(key, value) return self def to_disk(self, path, filename="lookups.bin", **kwargs): @@ -192,7 +191,7 @@ class Table(OrderedDict): self.name = name # Assume a default size of 1M items self.default_size = 1e6 - size = len(data) if data and len(data) > 0 else self.default_size + size = max(len(data), 1) if data is not None else self.default_size self.bloom = BloomFilter.from_error_rate(size) if data: self.update(data) From c4d02094726a7e92325f9fc0911fcfad7f43db75 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 26 Jun 2020 14:12:29 +0200 Subject: [PATCH 12/43] Extend v2.3 migration guide (#5653) * Extend preloaded vocab section * Add section on tag maps --- website/docs/usage/v2-3.md | 78 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 75 insertions(+), 3 deletions(-) diff --git a/website/docs/usage/v2-3.md b/website/docs/usage/v2-3.md index e6b88c779..b6c4d7dfb 100644 --- a/website/docs/usage/v2-3.md +++ b/website/docs/usage/v2-3.md @@ -182,12 +182,12 @@ If you're adding data for a new language, the normalization table should be added to `spacy-lookups-data`. See [adding norm exceptions](/usage/adding-languages#norm-exceptions). -#### No preloaded lexemes/vocab for models with vectors +#### No preloaded vocab for models with vectors To reduce the initial loading time, the lexemes in `nlp.vocab` are no longer loaded on initialization for models with vectors. As you process texts, the -lexemes will be added to the vocab automatically, just as in models without -vectors. +lexemes will be added to the vocab automatically, just as in small models +without vectors. To see the number of unique vectors and number of words with vectors, see `nlp.meta['vectors']`, for example for `en_core_web_md` there are `20000` @@ -210,6 +210,20 @@ for orth in nlp.vocab.vectors: _ = nlp.vocab[orth] ``` +If your workflow previously iterated over `nlp.vocab`, a similar alternative +is to iterate over words with vectors instead: + +```diff +- lexemes = [w for w in nlp.vocab] ++ lexemes = [nlp.vocab[orth] for orth in nlp.vocab.vectors] +``` + +Be aware that the set of preloaded lexemes in a v2.2 model is not equivalent to +the set of words with vectors. For English, v2.2 `md/lg` models have 1.3M +provided lexemes but only 685K words with vectors. The vectors have been +updated for most languages in v2.2, but the English models contain the same +vectors for both v2.2 and v2.3. + #### Lexeme.is_oov and Token.is_oov @@ -254,6 +268,28 @@ model vocab, which will take a few seconds on initial loading. 
When you save this model after loading the `prob` table, the full `prob` table will be saved as part of the model vocab. +To load the probability table into a provided model, first make sure you have +`spacy-lookups-data` installed. To load the table, remove the empty provided +`lexeme_prob` table and then access `Lexeme.prob` for any word to load the +table from `spacy-lookups-data`: + +```diff ++ # prerequisite: pip install spacy-lookups-data +import spacy + +nlp = spacy.load("en_core_web_md") + +# remove the empty placeholder prob table ++ if nlp.vocab.lookups_extra.has_table("lexeme_prob"): ++ nlp.vocab.lookups_extra.remove_table("lexeme_prob") + +# access any `.prob` to load the full table into the model +assert nlp.vocab["a"].prob == -3.9297883511 + +# if desired, save this model with the probability table included +nlp.to_disk("/path/to/model") +``` + If you'd like to include custom `cluster`, `prob`, or `sentiment` tables as part of a new model, add the data to [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) under @@ -271,3 +307,39 @@ When you initialize a new model with [`spacy init-model`](/api/cli#init-model), the `prob` table from `spacy-lookups-data` may be loaded as part of the initialization. If you'd like to omit this extra data as in spaCy's provided v2.3 models, use the new flag `--omit-extra-lookups`. + +#### Tag maps in provided models vs. blank models + +The tag maps in the provided models may differ from the tag maps in the spaCy +library. You can access the tag map in a loaded model under +`nlp.vocab.morphology.tag_map`. + +The tag map from `spacy.lang.lg.tag_map` is still used when a blank model is +initialized. If you want to provide an alternate tag map, update +`nlp.vocab.morphology.tag_map` after initializing the model or if you're using +the [train CLI](/api/cli#train), you can use the new `--tag-map-path` option to +provide in the tag map as a JSON dict. + +If you want to export a tag map from a provided model for use with the train +CLI, you can save it as a JSON dict. To only use string keys as required by +JSON and to make it easier to read and edit, any internal integer IDs need to +be converted back to strings: + +```python +import spacy +import srsly + +nlp = spacy.load("en_core_web_sm") +tag_map = {} + +# convert any integer IDs to strings for JSON +for tag, morph in nlp.vocab.morphology.tag_map.items(): + tag_map[tag] = {} + for feat, val in morph.items(): + feat = nlp.vocab.strings.as_string(feat) + if not isinstance(val, bool): + val = nlp.vocab.strings.as_string(val) + tag_map[tag][feat] = val + +srsly.write_json("tag_map.json", tag_map) +``` From 167df42cb6bdda3edf05cbef44f0edb9b73f05f1 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 29 Jun 2020 14:16:57 +0200 Subject: [PATCH 13/43] Move lemmatizer is_base_form to language settings (#5663) Move `Lemmatizer.is_base_form` to the language settings so that each language can provide a language-specific method as `LanguageDefaults.is_base_form`. The existing English-specific `Lemmatizer.is_base_form` is moved to `EnglishDefaults`. 
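A minimal usage sketch of the new hook (illustrative only — `my_is_base_form` and the toy lookup tables below are invented for this example; the real English rules now live in `EnglishDefaults.is_base_form`, as the diff below shows):

```python
# Sketch: plug a custom base-form check into the Lemmatizer via the new
# `is_base_form` keyword argument. The rule and the tables are toy examples,
# not real lemmatization data.
from spacy.lemmatizer import Lemmatizer
from spacy.lookups import Lookups


def my_is_base_form(univ_pos, morphology=None):
    # Toy rule: treat singular nouns as already being in their base form
    morphology = morphology or {}
    return univ_pos == "noun" and morphology.get("Number") == "sing"


lookups = Lookups()
lookups.add_table("lemma_rules", {"noun": [["s", ""]]})
lookups.add_table("lemma_index", {"noun": {}})
lookups.add_table("lemma_exc", {"noun": {}})

lemmatizer = Lemmatizer(lookups, is_base_form=my_is_base_form)
# the custom check short-circuits lemmatization for singular nouns ...
assert lemmatizer("dog", "noun", {"Number": "sing"}) == ["dog"]
# ... while the suffix rules still apply otherwise
assert lemmatizer("dogs", "noun", {"Number": "plur"}) == ["dog"]
```

Passing `is_base_form=None` (the default) simply skips the check, which is what the new Norwegian regression test further down exercises.
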
--- spacy/lang/en/__init__.py | 36 ++++++++++++++++++++ spacy/language.py | 3 +- spacy/lemmatizer.py | 39 ++-------------------- spacy/tests/regression/test_issue1-1000.py | 3 +- spacy/tests/test_lemmatizer.py | 12 +++++++ 5 files changed, 55 insertions(+), 38 deletions(-) diff --git a/spacy/lang/en/__init__.py b/spacy/lang/en/__init__.py index 4304b3c6a..d52f3dfd8 100644 --- a/spacy/lang/en/__init__.py +++ b/spacy/lang/en/__init__.py @@ -18,6 +18,41 @@ def _return_en(_): return "en" +def en_is_base_form(univ_pos, morphology=None): + """ + Check whether we're dealing with an uninflected paradigm, so we can + avoid lemmatization entirely. + + univ_pos (unicode / int): The token's universal part-of-speech tag. + morphology (dict): The token's morphological features following the + Universal Dependencies scheme. + """ + if morphology is None: + morphology = {} + if univ_pos == "noun" and morphology.get("Number") == "sing": + return True + elif univ_pos == "verb" and morphology.get("VerbForm") == "inf": + return True + # This maps 'VBP' to base form -- probably just need 'IS_BASE' + # morphology + elif univ_pos == "verb" and ( + morphology.get("VerbForm") == "fin" + and morphology.get("Tense") == "pres" + and morphology.get("Number") is None + ): + return True + elif univ_pos == "adj" and morphology.get("Degree") == "pos": + return True + elif morphology.get("VerbForm") == "inf": + return True + elif morphology.get("VerbForm") == "none": + return True + elif morphology.get("Degree") == "pos": + return True + else: + return False + + class EnglishDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters.update(LEX_ATTRS) @@ -26,6 +61,7 @@ class EnglishDefaults(Language.Defaults): tag_map = TAG_MAP stop_words = STOP_WORDS morph_rules = MORPH_RULES + is_base_form = en_is_base_form syntax_iterators = SYNTAX_ITERATORS single_orth_variants = [ {"tags": ["NFP"], "variants": ["…", "..."]}, diff --git a/spacy/language.py b/spacy/language.py index 2058def8a..faa0447a4 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -46,7 +46,7 @@ class BaseDefaults(object): def create_lemmatizer(cls, nlp=None, lookups=None): if lookups is None: lookups = cls.create_lookups(nlp=nlp) - return Lemmatizer(lookups=lookups) + return Lemmatizer(lookups=lookups, is_base_form=cls.is_base_form) @classmethod def create_lookups(cls, nlp=None): @@ -120,6 +120,7 @@ class BaseDefaults(object): tokenizer_exceptions = {} stop_words = set() morph_rules = {} + is_base_form = None lex_attr_getters = LEX_ATTRS syntax_iterators = {} resources = {} diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index 1f0f0da3f..f72eae128 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -21,7 +21,7 @@ class Lemmatizer(object): def load(cls, *args, **kwargs): raise NotImplementedError(Errors.E172) - def __init__(self, lookups, *args, **kwargs): + def __init__(self, lookups, *args, is_base_form=None, **kwargs): """Initialize a Lemmatizer. lookups (Lookups): The lookups object containing the (optional) tables @@ -31,6 +31,7 @@ class Lemmatizer(object): if args or kwargs or not isinstance(lookups, Lookups): raise ValueError(Errors.E173) self.lookups = lookups + self.is_base_form = is_base_form def __call__(self, string, univ_pos, morphology=None): """Lemmatize a string. @@ -51,7 +52,7 @@ class Lemmatizer(object): if univ_pos in ("", "eol", "space"): return [string.lower()] # See Issue #435 for example of where this logic is requied. 
- if self.is_base_form(univ_pos, morphology): + if callable(self.is_base_form) and self.is_base_form(univ_pos, morphology): return [string.lower()] index_table = self.lookups.get_table("lemma_index", {}) exc_table = self.lookups.get_table("lemma_exc", {}) @@ -69,40 +70,6 @@ class Lemmatizer(object): ) return lemmas - def is_base_form(self, univ_pos, morphology=None): - """ - Check whether we're dealing with an uninflected paradigm, so we can - avoid lemmatization entirely. - - univ_pos (unicode / int): The token's universal part-of-speech tag. - morphology (dict): The token's morphological features following the - Universal Dependencies scheme. - """ - if morphology is None: - morphology = {} - if univ_pos == "noun" and morphology.get("Number") == "sing": - return True - elif univ_pos == "verb" and morphology.get("VerbForm") == "inf": - return True - # This maps 'VBP' to base form -- probably just need 'IS_BASE' - # morphology - elif univ_pos == "verb" and ( - morphology.get("VerbForm") == "fin" - and morphology.get("Tense") == "pres" - and morphology.get("Number") is None - ): - return True - elif univ_pos == "adj" and morphology.get("Degree") == "pos": - return True - elif morphology.get("VerbForm") == "inf": - return True - elif morphology.get("VerbForm") == "none": - return True - elif morphology.get("Degree") == "pos": - return True - else: - return False - def noun(self, string, morphology=None): return self(string, "noun", morphology) diff --git a/spacy/tests/regression/test_issue1-1000.py b/spacy/tests/regression/test_issue1-1000.py index 6d88d68c2..38a99371e 100644 --- a/spacy/tests/regression/test_issue1-1000.py +++ b/spacy/tests/regression/test_issue1-1000.py @@ -11,6 +11,7 @@ from spacy.language import Language from spacy.lemmatizer import Lemmatizer from spacy.lookups import Lookups from spacy.tokens import Doc, Span +from spacy.lang.en import EnglishDefaults from ..util import get_doc, make_tempdir @@ -172,7 +173,7 @@ def test_issue595(): lookups.add_table("lemma_rules", {"verb": [["ed", "e"]]}) lookups.add_table("lemma_index", {"verb": {}}) lookups.add_table("lemma_exc", {"verb": {}}) - lemmatizer = Lemmatizer(lookups) + lemmatizer = Lemmatizer(lookups, is_base_form=EnglishDefaults.is_base_form) vocab = Vocab(lemmatizer=lemmatizer, tag_map=tag_map) doc = Doc(vocab, words=words) doc[2].tag_ = "VB" diff --git a/spacy/tests/test_lemmatizer.py b/spacy/tests/test_lemmatizer.py index fce3772c4..e7736b042 100644 --- a/spacy/tests/test_lemmatizer.py +++ b/spacy/tests/test_lemmatizer.py @@ -5,6 +5,7 @@ import pytest from spacy.tokens import Doc from spacy.language import Language from spacy.lookups import Lookups +from spacy.lemmatizer import Lemmatizer def test_lemmatizer_reflects_lookups_changes(): @@ -47,3 +48,14 @@ def test_tagger_warns_no_lookups(): with pytest.warns(None) as record: nlp.begin_training() assert not record.list + + +def test_lemmatizer_without_is_base_form_implementation(): + # Norwegian example from #5658 + lookups = Lookups() + lookups.add_table("lemma_rules", {"noun": []}) + lookups.add_table("lemma_index", {"noun": {}}) + lookups.add_table("lemma_exc", {"noun": {"formuesskatten": ["formuesskatt"]}}) + + lemmatizer = Lemmatizer(lookups, is_base_form=None) + assert lemmatizer("Formuesskatten", "noun", {'Definite': 'def', 'Gender': 'masc', 'Number': 'sing'}) == ["formuesskatt"] From 1dd38191ecf684caa967e54e70452f5150551de5 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 29 Jun 2020 14:20:26 +0200 Subject: [PATCH 14/43] Convert custom user_data to token 
extension format for Japanese tokenizer (#5652) * Convert custom user_data to token extension format Convert the user_data values so that they can be loaded as custom token extensions for `inflection`, `reading_form`, `sub_tokens`, and `lemma`. * Reset Underscore state in ja tokenizer tests --- spacy/lang/ja/__init__.py | 13 +++++----- spacy/tests/lang/ja/test_tokenizer.py | 37 +++++++++++++++++++++------ 2 files changed, 35 insertions(+), 15 deletions(-) diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py index fb8b9d7fe..f356f3d64 100644 --- a/spacy/lang/ja/__init__.py +++ b/spacy/lang/ja/__init__.py @@ -145,8 +145,7 @@ class JapaneseTokenizer(DummyTokenizer): dtokens, spaces = get_dtokens_and_spaces(dtokens, text) # create Doc with tag bi-gram based part-of-speech identification rules - words, tags, inflections, lemmas, readings, sub_tokens_list = zip(*dtokens) if dtokens else [[]] * 6 - sub_tokens_list = list(sub_tokens_list) + words = [dtoken.surface for dtoken in dtokens] doc = Doc(self.vocab, words=words, spaces=spaces) next_pos = None # for bi-gram rules for idx, (token, dtoken) in enumerate(zip(doc, dtokens)): @@ -158,14 +157,14 @@ class JapaneseTokenizer(DummyTokenizer): token.pos, next_pos = resolve_pos( token.orth_, dtoken.tag, - tags[idx + 1] if idx + 1 < len(tags) else None + dtokens[idx + 1].tag if idx + 1 < len(dtokens) else None ) # if there's no lemma info (it's an unk) just use the surface token.lemma_ = dtoken.lemma if dtoken.lemma else dtoken.surface - - doc.user_data["inflections"] = inflections - doc.user_data["reading_forms"] = readings - doc.user_data["sub_tokens"] = sub_tokens_list + doc.user_data[('._.', 'inflection', token.idx, None)] = dtoken.inf + doc.user_data[('._.', 'reading_form', token.idx, None)] = dtoken.reading + doc.user_data[('._.', 'sub_tokens', token.idx, None)] = dtoken.sub_tokens + doc.user_data[('._.', 'lemma', token.idx, None)] = token.lemma_ return doc diff --git a/spacy/tests/lang/ja/test_tokenizer.py b/spacy/tests/lang/ja/test_tokenizer.py index 651e906eb..fad5e1390 100644 --- a/spacy/tests/lang/ja/test_tokenizer.py +++ b/spacy/tests/lang/ja/test_tokenizer.py @@ -5,6 +5,18 @@ import pytest from ...tokenizer.test_naughty_strings import NAUGHTY_STRINGS from spacy.lang.ja import Japanese, DetailedToken +from spacy.tokens import Token +from spacy.tokens.underscore import Underscore + + +@pytest.fixture(scope="function", autouse=True) +def clean_underscore(): + # reset the Underscore object after the test, to avoid having state copied across tests + yield + Underscore.doc_extensions = {} + Underscore.span_extensions = {} + Underscore.token_extensions = {} + # fmt: off TOKENIZER_TESTS = [ @@ -127,24 +139,33 @@ def test_ja_tokenizer_sub_tokens(ja_tokenizer, text, sub_tokens_list_a, sub_toke nlp_b = Japanese(meta={"tokenizer": {"config": {"split_mode": "B"}}}) nlp_c = Japanese(meta={"tokenizer": {"config": {"split_mode": "C"}}}) - assert ja_tokenizer(text).user_data["sub_tokens"] == sub_tokens_list_a - assert nlp_a(text).user_data["sub_tokens"] == sub_tokens_list_a - assert nlp_b(text).user_data["sub_tokens"] == sub_tokens_list_b - assert nlp_c(text).user_data["sub_tokens"] == sub_tokens_list_c + doc = ja_tokenizer(text) + doc_a = nlp_a(text) + doc_b = nlp_b(text) + doc_c = nlp_c(text) + + Token.set_extension("sub_tokens", default="") + assert [t._.sub_tokens for t in doc] == sub_tokens_list_a + assert [t._.sub_tokens for t in doc_a] == sub_tokens_list_a + assert [t._.sub_tokens for t in doc_b] == sub_tokens_list_b + assert 
[t._.sub_tokens for t in doc_c] == sub_tokens_list_c @pytest.mark.parametrize("text,inflections,reading_forms", [ ( "取ってつけた", - ("五段-ラ行,連用形-促音便", "", "下一段-カ行,連用形-一般", "助動詞-タ,終止形-一般"), - ("トッ", "テ", "ツケ", "タ"), + ["五段-ラ行,連用形-促音便", "", "下一段-カ行,連用形-一般", "助動詞-タ,終止形-一般"], + ["トッ", "テ", "ツケ", "タ"], ), ] ) def test_ja_tokenizer_inflections_reading_forms(ja_tokenizer, text, inflections, reading_forms): - assert ja_tokenizer(text).user_data["inflections"] == inflections - assert ja_tokenizer(text).user_data["reading_forms"] == reading_forms + Token.set_extension("inflection", default="") + Token.set_extension("reading_form", default="") + doc = ja_tokenizer(text) + assert [t._.inflection for t in doc] == inflections + assert [t._.reading_form for t in doc] == reading_forms def test_ja_tokenizer_emptyish_texts(ja_tokenizer): From 2d715451a2215bf589da5e7d2c7a0234d05cbbc8 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 29 Jun 2020 14:34:15 +0200 Subject: [PATCH 15/43] Revert "Convert custom user_data to token extension format for Japanese tokenizer (#5652)" (#5665) This reverts commit 1dd38191ecf684caa967e54e70452f5150551de5. --- spacy/lang/ja/__init__.py | 13 +++++----- spacy/tests/lang/ja/test_tokenizer.py | 37 ++++++--------------------- 2 files changed, 15 insertions(+), 35 deletions(-) diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py index f356f3d64..fb8b9d7fe 100644 --- a/spacy/lang/ja/__init__.py +++ b/spacy/lang/ja/__init__.py @@ -145,7 +145,8 @@ class JapaneseTokenizer(DummyTokenizer): dtokens, spaces = get_dtokens_and_spaces(dtokens, text) # create Doc with tag bi-gram based part-of-speech identification rules - words = [dtoken.surface for dtoken in dtokens] + words, tags, inflections, lemmas, readings, sub_tokens_list = zip(*dtokens) if dtokens else [[]] * 6 + sub_tokens_list = list(sub_tokens_list) doc = Doc(self.vocab, words=words, spaces=spaces) next_pos = None # for bi-gram rules for idx, (token, dtoken) in enumerate(zip(doc, dtokens)): @@ -157,14 +158,14 @@ class JapaneseTokenizer(DummyTokenizer): token.pos, next_pos = resolve_pos( token.orth_, dtoken.tag, - dtokens[idx + 1].tag if idx + 1 < len(dtokens) else None + tags[idx + 1] if idx + 1 < len(tags) else None ) # if there's no lemma info (it's an unk) just use the surface token.lemma_ = dtoken.lemma if dtoken.lemma else dtoken.surface - doc.user_data[('._.', 'inflection', token.idx, None)] = dtoken.inf - doc.user_data[('._.', 'reading_form', token.idx, None)] = dtoken.reading - doc.user_data[('._.', 'sub_tokens', token.idx, None)] = dtoken.sub_tokens - doc.user_data[('._.', 'lemma', token.idx, None)] = token.lemma_ + + doc.user_data["inflections"] = inflections + doc.user_data["reading_forms"] = readings + doc.user_data["sub_tokens"] = sub_tokens_list return doc diff --git a/spacy/tests/lang/ja/test_tokenizer.py b/spacy/tests/lang/ja/test_tokenizer.py index fad5e1390..651e906eb 100644 --- a/spacy/tests/lang/ja/test_tokenizer.py +++ b/spacy/tests/lang/ja/test_tokenizer.py @@ -5,18 +5,6 @@ import pytest from ...tokenizer.test_naughty_strings import NAUGHTY_STRINGS from spacy.lang.ja import Japanese, DetailedToken -from spacy.tokens import Token -from spacy.tokens.underscore import Underscore - - -@pytest.fixture(scope="function", autouse=True) -def clean_underscore(): - # reset the Underscore object after the test, to avoid having state copied across tests - yield - Underscore.doc_extensions = {} - Underscore.span_extensions = {} - Underscore.token_extensions = {} - # fmt: off TOKENIZER_TESTS = [ @@ 
-139,33 +127,24 @@ def test_ja_tokenizer_sub_tokens(ja_tokenizer, text, sub_tokens_list_a, sub_toke nlp_b = Japanese(meta={"tokenizer": {"config": {"split_mode": "B"}}}) nlp_c = Japanese(meta={"tokenizer": {"config": {"split_mode": "C"}}}) - doc = ja_tokenizer(text) - doc_a = nlp_a(text) - doc_b = nlp_b(text) - doc_c = nlp_c(text) - - Token.set_extension("sub_tokens", default="") - assert [t._.sub_tokens for t in doc] == sub_tokens_list_a - assert [t._.sub_tokens for t in doc_a] == sub_tokens_list_a - assert [t._.sub_tokens for t in doc_b] == sub_tokens_list_b - assert [t._.sub_tokens for t in doc_c] == sub_tokens_list_c + assert ja_tokenizer(text).user_data["sub_tokens"] == sub_tokens_list_a + assert nlp_a(text).user_data["sub_tokens"] == sub_tokens_list_a + assert nlp_b(text).user_data["sub_tokens"] == sub_tokens_list_b + assert nlp_c(text).user_data["sub_tokens"] == sub_tokens_list_c @pytest.mark.parametrize("text,inflections,reading_forms", [ ( "取ってつけた", - ["五段-ラ行,連用形-促音便", "", "下一段-カ行,連用形-一般", "助動詞-タ,終止形-一般"], - ["トッ", "テ", "ツケ", "タ"], + ("五段-ラ行,連用形-促音便", "", "下一段-カ行,連用形-一般", "助動詞-タ,終止形-一般"), + ("トッ", "テ", "ツケ", "タ"), ), ] ) def test_ja_tokenizer_inflections_reading_forms(ja_tokenizer, text, inflections, reading_forms): - Token.set_extension("inflection", default="") - Token.set_extension("reading_form", default="") - doc = ja_tokenizer(text) - assert [t._.inflection for t in doc] == inflections - assert [t._.reading_form for t in doc] == reading_forms + assert ja_tokenizer(text).user_data["inflections"] == inflections + assert ja_tokenizer(text).user_data["reading_forms"] == reading_forms def test_ja_tokenizer_emptyish_texts(ja_tokenizer): From 8b0f7496062cc0570c334778a8d21b1a3408e478 Mon Sep 17 00:00:00 2001 From: Matthias Hertel Date: Tue, 30 Jun 2020 19:58:23 +0200 Subject: [PATCH 16/43] Website: fixed the token span in the text about the rule-based matching example (#5669) * fixed token span in pattern matcher example * contributor agreement --- .github/contributors/hertelm.md | 106 ++++++++++++++++++++++ website/docs/usage/rule-based-matching.md | 2 +- 2 files changed, 107 insertions(+), 1 deletion(-) create mode 100644 .github/contributors/hertelm.md diff --git a/.github/contributors/hertelm.md b/.github/contributors/hertelm.md new file mode 100644 index 000000000..ba4250bfc --- /dev/null +++ b/.github/contributors/hertelm.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. 
The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. 
Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Matthias Hertel | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | June 29, 2020 | +| GitHub username | hertelm | +| Website (optional) | | diff --git a/website/docs/usage/rule-based-matching.md b/website/docs/usage/rule-based-matching.md index f7866fe31..252aa8c77 100644 --- a/website/docs/usage/rule-based-matching.md +++ b/website/docs/usage/rule-based-matching.md @@ -122,7 +122,7 @@ for match_id, start, end in matches: ``` The matcher returns a list of `(match_id, start, end)` tuples – in this case, -`[('15578876784678163569', 0, 2)]`, which maps to the span `doc[0:2]` of our +`[('15578876784678163569', 0, 3)]`, which maps to the span `doc[0:3]` of our original document. The `match_id` is the [hash value](/usage/spacy-101#vocab) of the string ID "HelloWorld". To get the string value, you can look up the ID in the [`StringStore`](/api/stringstore). From ff0dbe5c6413b62a40f3268987844f0ce2a34a16 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Abella=20Bascar=C3=A1n?= Date: Tue, 30 Jun 2020 20:00:50 +0200 Subject: [PATCH 17/43] Fix in docs: pipe(docs) instead of pipe(texts) (#5680) Very minor fix in docs, specifically in this part: ``` matcher = PhraseMatcher(nlp.vocab) > for doc in matcher.pipe(texts, batch_size=50): > pass ``` `texts` suggests the input is an iterable of strings. I replaced it for `docs`. --- website/docs/api/phrasematcher.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/api/phrasematcher.md b/website/docs/api/phrasematcher.md index a72277420..49211174c 100644 --- a/website/docs/api/phrasematcher.md +++ b/website/docs/api/phrasematcher.md @@ -91,7 +91,7 @@ Match a stream of documents, yielding them in turn. 
> ```python > from spacy.matcher import PhraseMatcher > matcher = PhraseMatcher(nlp.vocab) -> for doc in matcher.pipe(texts, batch_size=50): +> for doc in matcher.pipe(docs, batch_size=50): > pass > ``` From f2a932a60c09766cda0c4b5c534bf94a7ad09add Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 1 Jul 2020 13:34:35 +0200 Subject: [PATCH 18/43] Update netlify.toml [ci skip] --- netlify.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/netlify.toml b/netlify.toml index be809f1d4..452b5979a 100644 --- a/netlify.toml +++ b/netlify.toml @@ -1,6 +1,8 @@ redirects = [ # Netlify {from = "https://spacy.netlify.com/*", to="https://spacy.io/:splat", force = true }, + # Subdomain for branches + {from = "https://nightly.spacy.io/*", to="https://spacy-io-develop.spacy.io/:splat", force = true, status = 200}, # Old subdomains {from = "https://survey.spacy.io/*", to = "https://spacy.io", force = true}, {from = "http://survey.spacy.io/*", to = "https://spacy.io", force = true}, From 6bc643d2e2a0812a4490f9ecc66bb480529a3b8f Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 1 Jul 2020 21:34:17 +0200 Subject: [PATCH 19/43] Update netlify.toml [ci skip] --- netlify.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/netlify.toml b/netlify.toml index 452b5979a..9cb11ae81 100644 --- a/netlify.toml +++ b/netlify.toml @@ -2,7 +2,7 @@ redirects = [ # Netlify {from = "https://spacy.netlify.com/*", to="https://spacy.io/:splat", force = true }, # Subdomain for branches - {from = "https://nightly.spacy.io/*", to="https://spacy-io-develop.spacy.io/:splat", force = true, status = 200}, + {from = "https://nightly.spacy.io/*", to="https://nightly-spacy-io.spacy.io/:splat", force = true, status = 200}, # Old subdomains {from = "https://survey.spacy.io/*", to = "https://spacy.io", force = true}, {from = "http://survey.spacy.io/*", to = "https://spacy.io", force = true}, From 2bd78c39e33b90f788b1121b93b3b098c4c4af10 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 2 Jul 2020 10:36:07 +0200 Subject: [PATCH 20/43] Fix multiple context manages in examples (#5690) --- examples/training/rehearsal.py | 2 +- examples/training/train_ner.py | 2 +- examples/training/train_new_entity_type.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/training/rehearsal.py b/examples/training/rehearsal.py index 24b1cea00..1cdac02aa 100644 --- a/examples/training/rehearsal.py +++ b/examples/training/rehearsal.py @@ -67,7 +67,7 @@ def main(model_name, unlabelled_loc): pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"] other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions] sizes = compounding(1.0, 4.0, 1.001) - with nlp.disable_pipes(*other_pipes) and warnings.catch_warnings(): + with nlp.disable_pipes(*other_pipes), warnings.catch_warnings(): # show warnings for misaligned entity spans once warnings.filterwarnings("once", category=UserWarning, module='spacy') diff --git a/examples/training/train_ner.py b/examples/training/train_ner.py index ff6029567..f64ba801a 100644 --- a/examples/training/train_ner.py +++ b/examples/training/train_ner.py @@ -59,7 +59,7 @@ def main(model=None, output_dir=None, n_iter=100): pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"] other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions] # only train NER - with nlp.disable_pipes(*other_pipes) and warnings.catch_warnings(): + with nlp.disable_pipes(*other_pipes), warnings.catch_warnings(): # show warnings for misaligned entity spans once 
warnings.filterwarnings("once", category=UserWarning, module='spacy') diff --git a/examples/training/train_new_entity_type.py b/examples/training/train_new_entity_type.py index e8ff6802a..a14688012 100644 --- a/examples/training/train_new_entity_type.py +++ b/examples/training/train_new_entity_type.py @@ -99,7 +99,7 @@ def main(model=None, new_model_name="animal", output_dir=None, n_iter=30): pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"] other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions] # only train NER - with nlp.disable_pipes(*other_pipes) and warnings.catch_warnings(): + with nlp.disable_pipes(*other_pipes), warnings.catch_warnings(): # show warnings for misaligned entity spans once warnings.filterwarnings("once", category=UserWarning, module='spacy') From 971826a96da9d114a86cbfb8b4bb9ab026abe8e6 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 2 Jul 2020 17:10:27 +0200 Subject: [PATCH 21/43] Include git commit in package and model meta (#5694) * Include git commit in package and model meta * Rewrite to read file in setup * Fix file handle --- .gitignore | 1 + MANIFEST.in | 1 + setup.py | 51 +++++++++++++++++++++++++++++++++++++++++++++++ spacy/language.py | 2 ++ 4 files changed, 55 insertions(+) diff --git a/.gitignore b/.gitignore index edcbba4d5..eb6be73dd 100644 --- a/.gitignore +++ b/.gitignore @@ -70,6 +70,7 @@ Pipfile.lock *.egg .eggs MANIFEST +spacy/git_info.py # Temporary files *.~* diff --git a/MANIFEST.in b/MANIFEST.in index 1947b9140..9819c7b70 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -6,3 +6,4 @@ include bin/spacy include pyproject.toml recursive-exclude spacy/lang *.json recursive-include spacy/lang *.json.gz +recursive-include licenses * diff --git a/setup.py b/setup.py index 62a09aa73..01e372e91 100755 --- a/setup.py +++ b/setup.py @@ -118,6 +118,55 @@ def is_source_release(path): return os.path.exists(os.path.join(path, "PKG-INFO")) +# Include the git version in the build (adapted from NumPy) +# Copyright (c) 2005-2020, NumPy Developers. 
+# BSD 3-Clause license, see licenses/3rd_party_licenses.txt +def write_git_info_py(filename="spacy/git_info.py"): + def _minimal_ext_cmd(cmd): + # construct minimal environment + env = {} + for k in ["SYSTEMROOT", "PATH", "HOME"]: + v = os.environ.get(k) + if v is not None: + env[k] = v + # LANGUAGE is used on win32 + env["LANGUAGE"] = "C" + env["LANG"] = "C" + env["LC_ALL"] = "C" + out = subprocess.check_output(cmd, stderr=subprocess.STDOUT, env=env) + return out + + git_version = "Unknown" + if os.path.exists(".git"): + try: + out = _minimal_ext_cmd(["git", "rev-parse", "--short", "HEAD"]) + git_version = out.strip().decode("ascii") + except: + pass + elif os.path.exists(filename): + # must be a source distribution, use existing version file + try: + a = open(filename, "r") + lines = a.readlines() + git_version = lines[-1].split('"')[1] + except: + pass + finally: + a.close() + + text = """# THIS FILE IS GENERATED FROM SPACY SETUP.PY +# +GIT_VERSION = "%(git_version)s" +""" + a = open(filename, "w") + try: + a.write( + text % {"git_version": git_version,} + ) + finally: + a.close() + + def clean(path): for name in MOD_NAMES: name = name.replace(".", "/") @@ -140,6 +189,8 @@ def chdir(new_dir): def setup_package(): + write_git_info_py() + root = os.path.abspath(os.path.dirname(__file__)) if len(sys.argv) > 1 and sys.argv[1] == "clean": diff --git a/spacy/language.py b/spacy/language.py index faa0447a4..e9d195453 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -34,6 +34,7 @@ from .lang.tag_map import TAG_MAP from .tokens import Doc from .lang.lex_attrs import LEX_ATTRS, is_stop from .errors import Errors, Warnings +from .git_info import GIT_VERSION from . import util from . import about @@ -206,6 +207,7 @@ class Language(object): self._meta.setdefault("email", "") self._meta.setdefault("url", "") self._meta.setdefault("license", "") + self._meta.setdefault("spacy_git_version", GIT_VERSION) self._meta["vectors"] = { "width": self.vocab.vectors_length, "vectors": len(self.vocab.vectors), From a77c4c3465d12f70fc2436b6d3def414082d77a9 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 2 Jul 2020 17:11:57 +0200 Subject: [PATCH 22/43] Add strings and ENT_KB_ID to Doc serialization (#5691) * Add strings for all writeable Token attributes to `Doc.to/from_bytes()`. * Add ENT_KB_ID to default attributes. 
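A rough round-trip sketch of what the extra serialized strings enable (illustrative only — the blank English pipeline, the example text, and the `"Q64"` knowledge-base ID are placeholders, and it assumes a build that includes this change): values written to writeable token attributes should now resolve even when the bytes are loaded into a `Doc` backed by a different, initially empty `Vocab`, because the strings travel with the serialized `Doc`.

```python
# Sketch: custom token strings survive to_bytes()/from_bytes() across vocabs.
# "Q64" is a placeholder KB ID, not real model output.
import spacy
from spacy.tokens import Doc
from spacy.vocab import Vocab

nlp = spacy.blank("en")
doc = nlp("Berlin is nice")
doc[0].lemma_ = "Berlin"
doc[0].ent_kb_id_ = "Q64"

doc_bytes = doc.to_bytes()
new_doc = Doc(Vocab()).from_bytes(doc_bytes)  # fresh vocab, no shared strings
assert new_doc[0].lemma_ == "Berlin"
assert new_doc[0].ent_kb_id_ == "Q64"
```
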
--- spacy/tests/doc/test_doc_api.py | 6 ++++++ spacy/tokens/doc.pyx | 15 ++++++++++++++- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index 6801d7844..388cd78fe 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -106,10 +106,16 @@ def test_doc_api_getitem(en_tokenizer): ) def test_doc_api_serialize(en_tokenizer, text): tokens = en_tokenizer(text) + tokens[0].lemma_ = "lemma" + tokens[0].norm_ = "norm" + tokens[0].ent_kb_id_ = "ent_kb_id" new_tokens = Doc(tokens.vocab).from_bytes(tokens.to_bytes()) assert tokens.text == new_tokens.text assert [t.text for t in tokens] == [t.text for t in new_tokens] assert [t.orth for t in tokens] == [t.orth for t in new_tokens] + assert new_tokens[0].lemma_ == "lemma" + assert new_tokens[0].norm_ == "norm" + assert new_tokens[0].ent_kb_id_ == "ent_kb_id" new_tokens = Doc(tokens.vocab).from_bytes( tokens.to_bytes(exclude=["tensor"]), exclude=["tensor"] diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 25a147208..5b03dc5d2 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -892,7 +892,7 @@ cdef class Doc: DOCS: https://spacy.io/api/doc#to_bytes """ - array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE, ENT_ID, NORM] # TODO: ENT_KB_ID ? + array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE, ENT_ID, NORM, ENT_KB_ID] if self.is_tagged: array_head.extend([TAG, POS]) # If doc parsed add head and dep attribute @@ -901,6 +901,14 @@ cdef class Doc: # Otherwise add sent_start else: array_head.append(SENT_START) + strings = set() + for token in self: + strings.add(token.tag_) + strings.add(token.lemma_) + strings.add(token.dep_) + strings.add(token.ent_type_) + strings.add(token.ent_kb_id_) + strings.add(token.norm_) # Msgpack doesn't distinguish between lists and tuples, which is # vexing for user data. As a best guess, we *know* that within # keys, we must have tuples. 
In values we just have to hope @@ -912,6 +920,7 @@ cdef class Doc: "sentiment": lambda: self.sentiment, "tensor": lambda: self.tensor, "cats": lambda: self.cats, + "strings": lambda: list(strings), } for key in kwargs: if key in serializers or key in ("user_data", "user_data_keys", "user_data_values"): @@ -942,6 +951,7 @@ cdef class Doc: "sentiment": lambda b: None, "tensor": lambda b: None, "cats": lambda b: None, + "strings": lambda b: None, "user_data_keys": lambda b: None, "user_data_values": lambda b: None, } @@ -965,6 +975,9 @@ cdef class Doc: self.tensor = msg["tensor"] if "cats" not in exclude and "cats" in msg: self.cats = msg["cats"] + if "strings" not in exclude and "strings" in msg: + for s in msg["strings"]: + self.vocab.strings.add(s) start = 0 cdef const LexemeC* lex cdef unicode orth_ From 2fb9bd795da1670e7ed7f3134652cf31aac10a96 Mon Sep 17 00:00:00 2001 From: Matthias Hertel Date: Fri, 3 Jul 2020 10:24:02 +0200 Subject: [PATCH 23/43] Fixed vocabulary in the entity linker training example (#5676) * entity linker training example: model loading changed according to issue 5668 (https://github.com/explosion/spaCy/issues/5668) + vocab_path is a required argument * contributor agreement --- examples/training/train_entity_linker.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/training/train_entity_linker.py b/examples/training/train_entity_linker.py index 3a8deb7a0..a68007504 100644 --- a/examples/training/train_entity_linker.py +++ b/examples/training/train_entity_linker.py @@ -60,12 +60,12 @@ TRAIN_DATA = sample_train_data() output_dir=("Optional output directory", "option", "o", Path), n_iter=("Number of training iterations", "option", "n", int), ) -def main(kb_path, vocab_path=None, output_dir=None, n_iter=50): +def main(kb_path, vocab_path, output_dir=None, n_iter=50): """Create a blank model with the specified vocab, set up the pipeline and train the entity linker. 
The `vocab` should be the one used during creation of the KB.""" - vocab = Vocab().from_disk(vocab_path) # create blank English model with correct vocab - nlp = spacy.blank("en", vocab=vocab) + nlp = spacy.blank("en") + nlp.vocab.from_disk(vocab_path) nlp.vocab.vectors.name = "spacy_pretrained_vectors" print("Created blank 'en' model with vocab from '%s'" % vocab_path) From 86d13a9fb84cee2df56a998446d340207dfdbd5f Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 3 Jul 2020 13:38:41 +0200 Subject: [PATCH 24/43] Set version to 2.3.1 (#5705) --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index 91810fa68..cd97fa987 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy" -__version__ = "2.3.0" +__version__ = "2.3.1" __release__ = True __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" From 3e78e82a834b42ff165fbb2614cb6f42206ce390 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 5 Jul 2020 15:48:39 +0200 Subject: [PATCH 25/43] Experimental character-based pretraining (#5700) * Use cosine loss in Cloze multitask * Fix char_embed for gpu * Call resume_training for base model in train CLI * Fix bilstm_depth default in pretrain command * Implement character-based pretraining objective * Use chars loss in ClozeMultitask * Add method to decode predicted characters * Fix number characters * Rescale gradients for mlm * Fix char embed+vectors in ml * Fix pipes * Fix pretrain args * Move get_characters_loss * Fix import * Fix import * Mention characters loss option in pretrain * Remove broken 'self attention' option in pretrain * Revert "Remove broken 'self attention' option in pretrain" This reverts commit 56b820f6afaef14e2cab9a6ff9f5edc58f806554. * Document 'characters' objective of pretrain --- spacy/_ml.py | 20 ++++++++++++-- spacy/cli/pretrain.py | 38 ++++++++++++++++----------- spacy/cli/train.py | 2 +- spacy/ml/_legacy_tok2vec.py | 11 +++++++- spacy/pipeline/pipes.pyx | 52 +++++++++++++++++++++++++++---------- website/docs/api/cli.md | 2 +- 6 files changed, 92 insertions(+), 33 deletions(-) diff --git a/spacy/_ml.py b/spacy/_ml.py index 60a0bbee0..d947aab1c 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -14,7 +14,7 @@ from thinc.api import with_getitem, flatten_add_lengths from thinc.api import uniqued, wrap, noop from thinc.linear.linear import LinearModel from thinc.neural.ops import NumpyOps, CupyOps -from thinc.neural.util import get_array_module, copy_array +from thinc.neural.util import get_array_module, copy_array, to_categorical from thinc.neural.optimizers import Adam from thinc import describe @@ -840,6 +840,8 @@ def masked_language_model(vocab, model, mask_prob=0.15): def mlm_backward(d_output, sgd=None): d_output *= 1 - mask + # Rescale gradient for number of instances. + d_output *= mask.size - mask.sum() return backprop(d_output, sgd=sgd) return output, mlm_backward @@ -944,7 +946,7 @@ class CharacterEmbed(Model): # for the tip. nCv = self.ops.xp.arange(self.nC) for doc in docs: - doc_ids = doc.to_utf8_array(nr_char=self.nC) + doc_ids = self.ops.asarray(doc.to_utf8_array(nr_char=self.nC)) doc_vectors = self.ops.allocate((len(doc), self.nC, self.nM)) # Let's say I have a 2d array of indices, and a 3d table of data. 
What numpy # incantation do I chant to get @@ -986,3 +988,17 @@ def get_cossim_loss(yh, y, ignore_zeros=False): losses[zero_indices] = 0 loss = losses.sum() return loss, -d_yh + + +def get_characters_loss(ops, docs, prediction, nr_char=10): + target_ids = numpy.vstack([doc.to_utf8_array(nr_char=nr_char) for doc in docs]) + target_ids = target_ids.reshape((-1,)) + target = ops.asarray(to_categorical(target_ids, nb_classes=256), dtype="f") + target = target.reshape((-1, 256*nr_char)) + diff = prediction - target + loss = (diff**2).sum() + d_target = diff / float(prediction.shape[0]) + return loss, d_target + + + diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py index aaec1ea75..6d6c65161 100644 --- a/spacy/cli/pretrain.py +++ b/spacy/cli/pretrain.py @@ -18,7 +18,8 @@ from ..errors import Errors from ..tokens import Doc from ..attrs import ID, HEAD from .._ml import Tok2Vec, flatten, chain, create_default_optimizer -from .._ml import masked_language_model, get_cossim_loss +from .._ml import masked_language_model, get_cossim_loss, get_characters_loss +from .._ml import MultiSoftmax from .. import util from .train import _load_pretrained_tok2vec @@ -42,7 +43,7 @@ from .train import _load_pretrained_tok2vec bilstm_depth=("Depth of BiLSTM layers (requires PyTorch)", "option", "lstm", int), embed_rows=("Number of embedding rows", "option", "er", int), loss_func=( - "Loss function to use for the objective. Either 'L2' or 'cosine'", + "Loss function to use for the objective. Either 'characters', 'L2' or 'cosine'", "option", "L", str, @@ -85,11 +86,11 @@ def pretrain( output_dir, width=96, conv_depth=4, - bilstm_depth=0, cnn_pieces=3, sa_depth=0, - use_chars=False, cnn_window=1, + bilstm_depth=0, + use_chars=False, embed_rows=2000, loss_func="cosine", use_vectors=False, @@ -124,11 +125,7 @@ def pretrain( config[key] = str(config[key]) util.fix_random_seed(seed) - has_gpu = prefer_gpu() - if has_gpu: - import torch - - torch.set_default_tensor_type("torch.cuda.FloatTensor") + has_gpu = prefer_gpu(gpu_id=1) msg.info("Using GPU" if has_gpu else "Not using GPU") output_dir = Path(output_dir) @@ -174,6 +171,7 @@ def pretrain( subword_features=not use_chars, # Set to False for Chinese etc cnn_maxout_pieces=cnn_pieces, # If set to 1, use Mish activation. ), + objective=loss_func ) # Load in pretrained weights if init_tok2vec is not None: @@ -264,7 +262,10 @@ def make_update(model, docs, optimizer, drop=0.0, objective="L2"): RETURNS loss: A float for the loss. """ predictions, backprop = model.begin_update(docs, drop=drop) - loss, gradients = get_vectors_loss(model.ops, docs, predictions, objective) + if objective == "characters": + loss, gradients = get_characters_loss(model.ops, docs, predictions) + else: + loss, gradients = get_vectors_loss(model.ops, docs, predictions, objective) backprop(gradients, sgd=optimizer) # Don't want to return a cupy object here # The gradients are modified in-place by the BERT MLM, @@ -326,16 +327,23 @@ def get_vectors_loss(ops, docs, prediction, objective="L2"): return loss, d_target -def create_pretraining_model(nlp, tok2vec): +def create_pretraining_model(nlp, tok2vec, objective="cosine", nr_char=10): """Define a network for the pretraining. We simply add an output layer onto the tok2vec input model. The tok2vec input model needs to be a model that takes a batch of Doc objects (as a list), and returns a list of arrays. Each array in the output needs to have one row per token in the doc. 
""" - output_size = nlp.vocab.vectors.data.shape[1] - output_layer = chain( - LN(Maxout(300, pieces=3)), Affine(output_size, drop_factor=0.0) - ) + if objective == "characters": + out_sizes = [256] * nr_char + output_layer = chain( + LN(Maxout(300, pieces=3)), + MultiSoftmax(out_sizes, 300) + ) + else: + output_size = nlp.vocab.vectors.data.shape[1] + output_layer = chain( + LN(Maxout(300, pieces=3)), Affine(output_size, drop_factor=0.0) + ) # This is annoying, but the parser etc have the flatten step after # the tok2vec. To load the weights in cleanly, we need to match # the shape of the models' components exactly. So what we cann diff --git a/spacy/cli/train.py b/spacy/cli/train.py index d4de9aeb4..fc4c9f67b 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -285,7 +285,7 @@ def train( if base_model and not pipes_added: # Start with an existing model, use default optimizer - optimizer = create_default_optimizer(Model.ops) + optimizer = nlp.resume_training(device=use_gpu) else: # Start with a blank model, call begin_training cfg = {"device": use_gpu} diff --git a/spacy/ml/_legacy_tok2vec.py b/spacy/ml/_legacy_tok2vec.py index b077a46b7..3e41b1c6a 100644 --- a/spacy/ml/_legacy_tok2vec.py +++ b/spacy/ml/_legacy_tok2vec.py @@ -49,6 +49,14 @@ def Tok2Vec(width, embed_size, **kwargs): >> LN(Maxout(width, width * 5, pieces=3)), column=cols.index(ORTH), ) + elif char_embed: + embed = concatenate_lists( + CharacterEmbed(nM=64, nC=8), + FeatureExtracter(cols) >> with_flatten(glove), + ) + reduce_dimensions = LN( + Maxout(width, 64 * 8 + width, pieces=cnn_maxout_pieces) + ) else: embed = uniqued( (glove | norm) >> LN(Maxout(width, width * 2, pieces=3)), @@ -81,7 +89,8 @@ def Tok2Vec(width, embed_size, **kwargs): ) else: tok2vec = FeatureExtracter(cols) >> with_flatten( - embed >> convolution ** conv_depth, pad=conv_depth + embed + >> convolution ** conv_depth, pad=conv_depth ) if bilstm_depth >= 1: diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 8f07bf8f7..b28f34a7a 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -33,6 +33,7 @@ from .._ml import build_text_classifier, build_simple_cnn_text_classifier from .._ml import build_bow_text_classifier, build_nel_encoder from .._ml import link_vectors_to_models, zero_init, flatten from .._ml import masked_language_model, create_default_optimizer, get_cossim_loss +from .._ml import MultiSoftmax, get_characters_loss from ..errors import Errors, TempErrors, Warnings from .. 
import util @@ -846,11 +847,15 @@ class MultitaskObjective(Tagger): class ClozeMultitask(Pipe): @classmethod def Model(cls, vocab, tok2vec, **cfg): - output_size = vocab.vectors.data.shape[1] - output_layer = chain( - LayerNorm(Maxout(output_size, tok2vec.nO, pieces=3)), - zero_init(Affine(output_size, output_size, drop_factor=0.0)) - ) + if cfg["objective"] == "characters": + out_sizes = [256] * cfg.get("nr_char", 4) + output_layer = MultiSoftmax(out_sizes) + else: + output_size = vocab.vectors.data.shape[1] + output_layer = chain( + LayerNorm(Maxout(output_size, tok2vec.nO, pieces=3)), + zero_init(Affine(output_size, output_size, drop_factor=0.0)) + ) model = chain(tok2vec, output_layer) model = masked_language_model(vocab, model) model.tok2vec = tok2vec @@ -861,6 +866,8 @@ class ClozeMultitask(Pipe): self.vocab = vocab self.model = model self.cfg = cfg + self.cfg.setdefault("objective", "characters") + self.cfg.setdefault("nr_char", 4) def set_annotations(self, docs, dep_ids, tensors=None): pass @@ -869,7 +876,8 @@ class ClozeMultitask(Pipe): tok2vec=None, sgd=None, **kwargs): link_vectors_to_models(self.vocab) if self.model is True: - self.model = self.Model(self.vocab, tok2vec) + kwargs.update(self.cfg) + self.model = self.Model(self.vocab, tok2vec, **kwargs) X = self.model.ops.allocate((5, self.model.tok2vec.nO)) self.model.output_layer.begin_training(X) if sgd is None: @@ -883,13 +891,16 @@ class ClozeMultitask(Pipe): return tokvecs, vectors def get_loss(self, docs, vectors, prediction): - # The simplest way to implement this would be to vstack the - # token.vector values, but that's a bit inefficient, especially on GPU. - # Instead we fetch the index into the vectors table for each of our tokens, - # and look them up all at once. This prevents data copying. - ids = self.model.ops.flatten([doc.to_array(ID).ravel() for doc in docs]) - target = vectors[ids] - loss, gradient = get_cossim_loss(prediction, target, ignore_zeros=True) + if self.cfg["objective"] == "characters": + loss, gradient = get_characters_loss(self.model.ops, docs, prediction) + else: + # The simplest way to implement this would be to vstack the + # token.vector values, but that's a bit inefficient, especially on GPU. + # Instead we fetch the index into the vectors table for each of our tokens, + # and look them up all at once. This prevents data copying. 
+ ids = self.model.ops.flatten([doc.to_array(ID).ravel() for doc in docs]) + target = vectors[ids] + loss, gradient = get_cossim_loss(prediction, target, ignore_zeros=True) return float(loss), gradient def update(self, docs, golds, drop=0., sgd=None, losses=None): @@ -906,6 +917,20 @@ class ClozeMultitask(Pipe): if losses is not None: losses[self.name] += loss + @staticmethod + def decode_utf8_predictions(char_array): + # The format alternates filling from start and end, and 255 is missing + words = [] + char_array = char_array.reshape((char_array.shape[0], -1, 256)) + nr_char = char_array.shape[1] + char_array = char_array.argmax(axis=-1) + for row in char_array: + starts = [chr(c) for c in row[::2] if c != 255] + ends = [chr(c) for c in row[1::2] if c != 255] + word = "".join(starts + list(reversed(ends))) + words.append(word) + return words + @component("textcat", assigns=["doc.cats"]) class TextCategorizer(Pipe): @@ -1069,6 +1094,7 @@ cdef class DependencyParser(Parser): assigns = ["token.dep", "token.is_sent_start", "doc.sents"] requires = [] TransitionSystem = ArcEager + nr_feature = 8 @property def postprocesses(self): diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index fe8877c69..779fa7695 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -473,7 +473,7 @@ $ python -m spacy pretrain [texts_loc] [vectors_model] [output_dir] | `--use-chars`, `-chr` 2.2.2 | flag | Whether to use character-based embedding. | | `--sa-depth`, `-sa` 2.2.2 | option | Depth of self-attention layers. | | `--embed-rows`, `-er` | option | Number of embedding rows. | -| `--loss-func`, `-L` | option | Loss function to use for the objective. Either `"L2"` or `"cosine"`. | +| `--loss-func`, `-L` | option | Loss function to use for the objective. Either `"cosine"`, `"L2"` or `"characters"`. | | `--dropout`, `-d` | option | Dropout rate. | | `--batch-size`, `-bs` | option | Number of words per training batch. | | `--max-length`, `-xw` | option | Maximum words per example. Longer examples are discarded. | From 9860b8399ed2a3d1d680e1c1cd31d85926422709 Mon Sep 17 00:00:00 2001 From: graue70 <23035329+graue70@users.noreply.github.com> Date: Sun, 5 Jul 2020 15:49:06 +0200 Subject: [PATCH 26/43] Fix typo in test function docstring (#5696) --- spacy/tests/regression/test_issue2501-3000.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tests/regression/test_issue2501-3000.py b/spacy/tests/regression/test_issue2501-3000.py index 1f5e44499..622fc3635 100644 --- a/spacy/tests/regression/test_issue2501-3000.py +++ b/spacy/tests/regression/test_issue2501-3000.py @@ -59,7 +59,7 @@ def test_issue2626_2835(en_tokenizer, text): def test_issue2656(en_tokenizer): - """Test that tokenizer correctly splits of punctuation after numbers with + """Test that tokenizer correctly splits off punctuation after numbers with decimal points. 
""" doc = en_tokenizer("I went for 40.3, and got home by 10.0.") From 7a2ca00794da43edc9c55e690b647d5b5b962e42 Mon Sep 17 00:00:00 2001 From: Mike Izbicki Date: Mon, 6 Jul 2020 08:03:33 -0700 Subject: [PATCH 27/43] fix bug in Korean language, resulting in 100x speedup by reducing overhead of mecab (#5701) * speed up Korean nlp 100x by stopping mecab from reloading on each doc * add contributor agreement * rename variables to improve code readability --- .github/contributors/mikeizbicki.md | 106 ++++++++++++++++++++++++++++ spacy/lang/ko/__init__.py | 27 +++---- 2 files changed, 121 insertions(+), 12 deletions(-) create mode 100644 .github/contributors/mikeizbicki.md diff --git a/.github/contributors/mikeizbicki.md b/.github/contributors/mikeizbicki.md new file mode 100644 index 000000000..6e9d8c098 --- /dev/null +++ b/.github/contributors/mikeizbicki.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. 
With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Mike Izbicki | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 02 Jun 2020 | +| GitHub username | mikeizbicki | +| Website (optional) | https://izbicki.me | diff --git a/spacy/lang/ko/__init__.py b/spacy/lang/ko/__init__.py index ec79a95ab..21a754168 100644 --- a/spacy/lang/ko/__init__.py +++ b/spacy/lang/ko/__init__.py @@ -42,7 +42,11 @@ def check_spaces(text, tokens): class KoreanTokenizer(DummyTokenizer): def __init__(self, cls, nlp=None): self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp) - self.Tokenizer = try_mecab_import() + MeCab = try_mecab_import() + self.mecab_tokenizer = MeCab("-F%f[0],%f[7]") + + def __del__(self): + self.mecab_tokenizer.__del__() def __call__(self, text): dtokens = list(self.detailed_tokens(text)) @@ -58,17 +62,16 @@ class KoreanTokenizer(DummyTokenizer): def detailed_tokens(self, text): # 품사 태그(POS)[0], 의미 부류(semantic class)[1], 종성 유무(jongseong)[2], 읽기(reading)[3], # 타입(type)[4], 첫번째 품사(start pos)[5], 마지막 품사(end pos)[6], 표현(expression)[7], * - with self.Tokenizer("-F%f[0],%f[7]") as tokenizer: - for node in tokenizer.parse(text, as_nodes=True): - if node.is_eos(): - break - surface = node.surface - feature = node.feature - tag, _, expr = feature.partition(",") - lemma, _, remainder = expr.partition("/") - if lemma == "*": - lemma = surface - yield {"surface": surface, "lemma": lemma, "tag": tag} + for node in self.mecab_tokenizer.parse(text, as_nodes=True): + if node.is_eos(): + break + surface = node.surface + feature = node.feature + tag, _, expr = feature.partition(",") + lemma, _, remainder = expr.partition("/") + if lemma == "*": + lemma = surface + yield {"surface": surface, "lemma": lemma, "tag": tag} class KoreanDefaults(Language.Defaults): From 546f3d10d4ab2f6e2d7149d13087a41480335ddd Mon Sep 17 00:00:00 2001 From: Jonathan Besomi <43236409+jbesomi@users.noreply.github.com> Date: Tue, 7 Jul 2020 20:54:22 +0200 Subject: [PATCH 28/43] Add texthero to universe.json (#5716) * Add texthero to universe.json * Add spaCy contributor Agreement --- .github/contributors/jbesomi.md | 106 ++++++++++++++++++++++++++++++++ website/meta/universe.json | 26 ++++++++ 2 files changed, 132 insertions(+) create mode 100644 .github/contributors/jbesomi.md diff --git a/.github/contributors/jbesomi.md b/.github/contributors/jbesomi.md new file mode 100644 index 000000000..ac43a3bfd --- /dev/null +++ b/.github/contributors/jbesomi.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. 
These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. 
Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Jonathan B. | +| Company name (if applicable) | besomi.ai | +| Title or role (if applicable) | - | +| Date | 07.07.2020 | +| GitHub username | jbesomi | +| Website (optional) | besomi.ai | diff --git a/website/meta/universe.json b/website/meta/universe.json index 2c74a2964..1d732f088 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -2347,6 +2347,32 @@ }, "category": ["pipeline", "conversational", "research"], "tags": ["spell check", "correction", "preprocessing", "translation", "correction"] + }, + { + "id": "texthero", + "title": "Texthero", + "slogan": "Text preprocessing, representation and visualization from zero to hero.", + "description": "Texthero is a python package to work with text data efficiently. It empowers NLP developers with a tool to quickly understand any text-based dataset and it provides a solid pipeline to clean and represent text data, from zero to hero.", + "github": "jbesomi/texthero", + "pip": "texthero", + "code_example": [ + "import texthero as hero", + "import pandas as pd", + "", + "df = pd.read_csv('https://github.com/jbesomi/texthero/raw/master/dataset/bbcsport.csv')", + "df['named_entities'] = hero.named_entities(df['text'])", + "df.head()" + ], + "code_language": "python", + "url": "https://texthero.org", + "thumb": "https://texthero.org/img/T.png", + "image": "https://texthero.org/docs/assets/texthero.png", + "author": "Jonathan Besomi", + "author_links": { + "github": "jbesomi", + "website": "https://besomi.ai" + }, + "category": ["standalone"], } ], From 9097549227c56a34fd00b47957de12010fe57d53 Mon Sep 17 00:00:00 2001 From: gandersen101 Date: Tue, 7 Jul 2020 13:55:24 -0500 Subject: [PATCH 29/43] Adding spaczz package to universe.json (#5717) * Adding spaczz package to universe.json * Adding contributor agreement. --- .github/contributors/gandersen101.md | 106 +++++++++++++++++++++++++++ website/meta/universe.json | 29 ++++++++ 2 files changed, 135 insertions(+) create mode 100644 .github/contributors/gandersen101.md diff --git a/.github/contributors/gandersen101.md b/.github/contributors/gandersen101.md new file mode 100644 index 000000000..cae4ad047 --- /dev/null +++ b/.github/contributors/gandersen101.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. 
For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. 
This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [ x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Grant Andersen | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 07.06.2020 | +| GitHub username | gandersen101 | +| Website (optional) | | diff --git a/website/meta/universe.json b/website/meta/universe.json index 1d732f088..e57f2bf70 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -1,5 +1,34 @@ { "resources": [ + { + "id": "spaczz", + "title": "spaczz", + "slogan": "Fuzzy matching and more for spaCy.", + "description": "Spaczz provides fuzzy matching and multi-token regex matching functionality for spaCy. Spaczz's components have similar APIs to their spaCy counterparts and spaczz pipeline components can integrate into spaCy pipelines where they can be saved/loaded as models.", + "github": "gandersen101/spaczz", + "pip": "spaczz", + "code_example": [ + "import spacy", + "from spaczz.pipeline import SpaczzRuler", + "", + "nlp = spacy.blank('en')", + "ruler = SpaczzRuler(nlp)", + "ruler.add_patterns([{'label': 'PERSON', 'pattern': 'Bill Gates', 'type': 'fuzzy'}])", + "nlp.add_pipe(ruler)", + "", + "doc = nlp('Oops, I spelled Bill Gattes' name wrong.')", + "print([(ent.text, ent.start, ent.end, ent.label_) for ent in doc.ents])" + ], + "code_language": "python", + "url": "https://spaczz.readthedocs.io/en/latest/", + "author": "Grant Andersen", + "author_links": { + "twitter": "gandersen101", + "github": "gandersen101" + }, + "category": ["pipeline"], + "tags": ["fuzzy-matching", "regex"] + }, { "id": "spacy-universal-sentence-encoder", "title": "SpaCy - Universal Sentence Encoder", From 109849bd311490f17a29b320cb032e43d153f36f Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 7 Jul 2020 21:12:28 +0200 Subject: [PATCH 30/43] Fix and update universe.json [ci skip] --- website/meta/universe.json | 53 +++++++++++++++++++++++++++++++++++++- 1 file changed, 52 insertions(+), 1 deletion(-) diff --git a/website/meta/universe.json b/website/meta/universe.json index e57f2bf70..2b6d82663 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -1,5 +1,29 @@ { "resources": [ + { + "id": "spacy-streamlit", + "title": "spacy-streamlit", + "slogan": "spaCy building blocks for Streamlit apps", + "github": "explosion/spacy-streamlit", + "description": "This package contains utilities for visualizing spaCy models and building interactive spaCy-powered apps with [Streamlit](https://streamlit.io). 
It includes various building blocks you can use in your own Streamlit app, like visualizers for **syntactic dependencies**, **named entities**, **text classification**, **semantic similarity** via word vectors, token attributes, and more.", + "pip": "spacy-streamlit", + "category": ["visualizers"], + "thumb": "https://i.imgur.com/mhEjluE.jpg", + "image": "https://user-images.githubusercontent.com/13643239/85388081-f2da8700-b545-11ea-9bd4-e303d3c5763c.png", + "code_example": [ + "import spacy_streamlit", + "", + "models = [\"en_core_web_sm\", \"en_core_web_md\"]", + "default_text = \"Sundar Pichai is the CEO of Google.\"", + "spacy_streamlit.visualize(models, default_text))" + ], + "author": "Ines Montani", + "author_links": { + "twitter": "_inesmontani", + "github": "ines", + "website": "https://ines.io" + } + }, { "id": "spaczz", "title": "spaczz", @@ -1266,6 +1290,19 @@ "youtube": "K1elwpgDdls", "category": ["videos"] }, + { + "type": "education", + "id": "video-spacy-course-es", + "title": "NLP avanzado con spaCy · Un curso en línea gratis", + "description": "spaCy es un paquete moderno de Python para hacer Procesamiento de Lenguaje Natural de potencia industrial. En este curso en línea, interactivo y gratuito, aprenderás a usar spaCy para construir sistemas avanzados de comprensión de lenguaje natural usando enfoques basados en reglas y en machine learning.", + "url": "https://course.spacy.io/es", + "author": "Camila Gutiérrez", + "author_links": { + "twitter": "Mariacamilagl30" + }, + "youtube": "RNiLVCE5d4k", + "category": ["videos"] + }, { "type": "education", "id": "video-intro-to-nlp-episode-1", @@ -1322,6 +1359,20 @@ "youtube": "IqOJU1-_Fi0", "category": ["videos"] }, + { + "type": "education", + "id": "video-intro-to-nlp-episode-5", + "title": "Intro to NLP with spaCy (5)", + "slogan": "Episode 5: Rules vs. Machine Learning", + "description": "In this new video series, data science instructor Vincent Warmerdam gets started with spaCy, an open-source library for Natural Language Processing in Python. His mission: building a system to automatically detect programming languages in large volumes of text. 
Follow his process from the first idea to a prototype all the way to data collection and training a statistical named entity recogntion model from scratch.", + "author": "Vincent Warmerdam", + "author_links": { + "twitter": "fishnets88", + "github": "koaning" + }, + "youtube": "f4sqeLRzkPg", + "category": ["videos"] + }, { "type": "education", "id": "video-spacy-irl-entity-linking", @@ -2401,7 +2452,7 @@ "github": "jbesomi", "website": "https://besomi.ai" }, - "category": ["standalone"], + "category": ["standalone"] } ], From 893133873d8ef906b37b2214fe34a1b4b94b2d3e Mon Sep 17 00:00:00 2001 From: gandersen101 Date: Tue, 7 Jul 2020 19:16:28 -0500 Subject: [PATCH 31/43] Fix quote issue in spaczz universe.json --- website/meta/universe.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/meta/universe.json b/website/meta/universe.json index 2b6d82663..c5eb96e43 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -40,7 +40,7 @@ "ruler.add_patterns([{'label': 'PERSON', 'pattern': 'Bill Gates', 'type': 'fuzzy'}])", "nlp.add_pipe(ruler)", "", - "doc = nlp('Oops, I spelled Bill Gattes' name wrong.')", + "doc = nlp('Oops, I spelled Bill Gatez wrong.')", "print([(ent.text, ent.start, ent.end, ent.label_) for ent in doc.ents])" ], "code_language": "python", From 923affd091dac4c1a7e11168f7c8f1f05dcc224e Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 9 Jul 2020 22:11:13 +0200 Subject: [PATCH 32/43] Remove is_base_form from French lemmatizer (#5733) Remove English-specific is_base_form from French lemmatizer. --- spacy/lang/fr/lemmatizer.py | 40 ------------------------------------- 1 file changed, 40 deletions(-) diff --git a/spacy/lang/fr/lemmatizer.py b/spacy/lang/fr/lemmatizer.py index 79f4dd28d..af8345e1b 100644 --- a/spacy/lang/fr/lemmatizer.py +++ b/spacy/lang/fr/lemmatizer.py @@ -45,9 +45,6 @@ class FrenchLemmatizer(Lemmatizer): univ_pos = "sconj" else: return [self.lookup(string)] - # See Issue #435 for example of where this logic is requied. - if self.is_base_form(univ_pos, morphology): - return list(set([string.lower()])) index_table = self.lookups.get_table("lemma_index", {}) exc_table = self.lookups.get_table("lemma_exc", {}) rules_table = self.lookups.get_table("lemma_rules", {}) @@ -59,43 +56,6 @@ class FrenchLemmatizer(Lemmatizer): ) return lemmas - def is_base_form(self, univ_pos, morphology=None): - """ - Check whether we're dealing with an uninflected paradigm, so we can - avoid lemmatization entirely. 
- """ - morphology = {} if morphology is None else morphology - others = [ - key - for key in morphology - if key not in (POS, "Number", "POS", "VerbForm", "Tense") - ] - if univ_pos == "noun" and morphology.get("Number") == "sing": - return True - elif univ_pos == "verb" and morphology.get("VerbForm") == "inf": - return True - # This maps 'VBP' to base form -- probably just need 'IS_BASE' - # morphology - elif univ_pos == "verb" and ( - morphology.get("VerbForm") == "fin" - and morphology.get("Tense") == "pres" - and morphology.get("Number") is None - and not others - ): - return True - elif univ_pos == "adj" and morphology.get("Degree") == "pos": - return True - elif VerbForm_inf in morphology: - return True - elif VerbForm_none in morphology: - return True - elif Number_sing in morphology: - return True - elif Degree_pos in morphology: - return True - else: - return False - def noun(self, string, morphology=None): return self(string, "noun", morphology) From 0a62098c5f0e0abe640a76776ddf6ea7094e2c23 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 9 Jul 2020 22:11:24 +0200 Subject: [PATCH 33/43] Fix lemmatizer is_base_form for python2.7 (#5734) * Fix lemmatizer init args for python2.7 * Move English is_base_form to a class method * Skip test pickling PhraseMatcher for python2 --- spacy/lang/en/__init__.py | 71 +++++++++---------- spacy/lemmatizer.py | 2 +- spacy/tests/regression/test_issue3001-3500.py | 1 + 3 files changed, 37 insertions(+), 37 deletions(-) diff --git a/spacy/lang/en/__init__.py b/spacy/lang/en/__init__.py index d52f3dfd8..f58ae4a4e 100644 --- a/spacy/lang/en/__init__.py +++ b/spacy/lang/en/__init__.py @@ -18,41 +18,6 @@ def _return_en(_): return "en" -def en_is_base_form(univ_pos, morphology=None): - """ - Check whether we're dealing with an uninflected paradigm, so we can - avoid lemmatization entirely. - - univ_pos (unicode / int): The token's universal part-of-speech tag. - morphology (dict): The token's morphological features following the - Universal Dependencies scheme. - """ - if morphology is None: - morphology = {} - if univ_pos == "noun" and morphology.get("Number") == "sing": - return True - elif univ_pos == "verb" and morphology.get("VerbForm") == "inf": - return True - # This maps 'VBP' to base form -- probably just need 'IS_BASE' - # morphology - elif univ_pos == "verb" and ( - morphology.get("VerbForm") == "fin" - and morphology.get("Tense") == "pres" - and morphology.get("Number") is None - ): - return True - elif univ_pos == "adj" and morphology.get("Degree") == "pos": - return True - elif morphology.get("VerbForm") == "inf": - return True - elif morphology.get("VerbForm") == "none": - return True - elif morphology.get("Degree") == "pos": - return True - else: - return False - - class EnglishDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters.update(LEX_ATTRS) @@ -61,7 +26,6 @@ class EnglishDefaults(Language.Defaults): tag_map = TAG_MAP stop_words = STOP_WORDS morph_rules = MORPH_RULES - is_base_form = en_is_base_form syntax_iterators = SYNTAX_ITERATORS single_orth_variants = [ {"tags": ["NFP"], "variants": ["…", "..."]}, @@ -72,6 +36,41 @@ class EnglishDefaults(Language.Defaults): {"tags": ["``", "''"], "variants": [('"', '"'), ("“", "”")]}, ] + @classmethod + def is_base_form(cls, univ_pos, morphology=None): + """ + Check whether we're dealing with an uninflected paradigm, so we can + avoid lemmatization entirely. + + univ_pos (unicode / int): The token's universal part-of-speech tag. 
+ morphology (dict): The token's morphological features following the + Universal Dependencies scheme. + """ + if morphology is None: + morphology = {} + if univ_pos == "noun" and morphology.get("Number") == "sing": + return True + elif univ_pos == "verb" and morphology.get("VerbForm") == "inf": + return True + # This maps 'VBP' to base form -- probably just need 'IS_BASE' + # morphology + elif univ_pos == "verb" and ( + morphology.get("VerbForm") == "fin" + and morphology.get("Tense") == "pres" + and morphology.get("Number") is None + ): + return True + elif univ_pos == "adj" and morphology.get("Degree") == "pos": + return True + elif morphology.get("VerbForm") == "inf": + return True + elif morphology.get("VerbForm") == "none": + return True + elif morphology.get("Degree") == "pos": + return True + else: + return False + class English(Language): lang = "en" diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index f72eae128..8b2375257 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -21,7 +21,7 @@ class Lemmatizer(object): def load(cls, *args, **kwargs): raise NotImplementedError(Errors.E172) - def __init__(self, lookups, *args, is_base_form=None, **kwargs): + def __init__(self, lookups, is_base_form=None, *args, **kwargs): """Initialize a Lemmatizer. lookups (Lookups): The lookups object containing the (optional) tables diff --git a/spacy/tests/regression/test_issue3001-3500.py b/spacy/tests/regression/test_issue3001-3500.py index effbebb92..a10225390 100644 --- a/spacy/tests/regression/test_issue3001-3500.py +++ b/spacy/tests/regression/test_issue3001-3500.py @@ -121,6 +121,7 @@ def test_issue3248_1(): assert len(matcher) == 2 +@pytest.mark.skipif(is_python2, reason="Can't pickle instancemethod for is_base_form") def test_issue3248_2(): """Test that the PhraseMatcher can be pickled correctly.""" nlp = English() From 27a1cd3c630055802513a20c1e75d0b37943cc39 Mon Sep 17 00:00:00 2001 From: Mark Neumann Date: Sun, 12 Jul 2020 13:06:46 -0700 Subject: [PATCH 34/43] fix meta serialization in train (#5751) Co-authored-by: Mark Neumann --- spacy/cli/train.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index fc4c9f67b..b81214b95 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -576,6 +576,8 @@ def train( with nlp.use_params(optimizer.averages): final_model_path = output_path / "model-final" nlp.to_disk(final_model_path) + srsly.write_json(final_model_path / "meta.json", meta) + meta_loc = output_path / "model-final" / "meta.json" final_meta = srsly.read_json(meta_loc) final_meta.setdefault("accuracy", {}) From 7ea2cc76508cdcd6d854381f00f1da79309a0df3 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 13 Jul 2020 14:55:56 +0200 Subject: [PATCH 35/43] Set version to 2.3.2 (#5756) --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index cd97fa987..42c38cda5 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy" -__version__ = "2.3.1" +__version__ = "2.3.2" __release__ = True __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" From 5228920e2fa3c1c067ec753ae40cfaea07908cfb Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 14 Jul 2020 14:09:48 +0200 Subject: [PATCH 36/43] Clarify warning W030 for misaligned BILUO tags (#5761) --- spacy/errors.py | 4 ++-- 1 file changed, 2 
insertions(+), 2 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index a25661a20..ff71b60eb 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -113,8 +113,8 @@ class Warnings(object): W030 = ("Some entities could not be aligned in the text \"{text}\" with " "entities \"{entities}\". Use " "`spacy.gold.biluo_tags_from_offsets(nlp.make_doc(text), entities)`" - " to check the alignment. Misaligned entities ('-') will be " - "ignored during training.") + " to check the alignment. Misaligned entities (with BILUO tag '-') " + "will be ignored during training.") W031 = ("Model '{model}' ({model_version}) requires spaCy {version} and " "is incompatible with the current spaCy version ({current}). This " "may lead to unexpected results or runtime errors. To resolve " From 6f4e4aceb3710262ab376c79c5f740f72070b8e0 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 18 Jul 2020 23:50:29 +0200 Subject: [PATCH 37/43] Add Plausible [ci skip] --- website/gatsby-config.js | 6 +++ website/meta/site.json | 1 + website/package-lock.json | 80 +++++++++++++++++++++++++++++++++------ website/package.json | 1 + 4 files changed, 76 insertions(+), 12 deletions(-) diff --git a/website/gatsby-config.js b/website/gatsby-config.js index aacc25545..2a5f957f4 100644 --- a/website/gatsby-config.js +++ b/website/gatsby-config.js @@ -140,6 +140,12 @@ module.exports = { respectDNT: true, }, }, + { + resolve: `gatsby-plugin-plausible`, + options: { + domain: site.domain, + }, + }, `gatsby-plugin-offline`, ], } diff --git a/website/meta/site.json b/website/meta/site.json index 8b8424f82..4d12a4c46 100644 --- a/website/meta/site.json +++ b/website/meta/site.json @@ -3,6 +3,7 @@ "description": "spaCy is a free open-source library for Natural Language Processing in Python. 
It features NER, POS tagging, dependency parsing, word vectors and more.", "slogan": "Industrial-strength Natural Language Processing in Python", "siteUrl": "https://spacy.io", + "domain": "spacy.io", "email": "contact@explosion.ai", "company": "Explosion AI", "companyUrl": "https://explosion.ai", diff --git a/website/package-lock.json b/website/package-lock.json index cb1731c1b..dded33fb0 100644 --- a/website/package-lock.json +++ b/website/package-lock.json @@ -1424,6 +1424,7 @@ "version": "0.8.0", "resolved": "https://registry.npmjs.org/@sindresorhus/slugify/-/slugify-0.8.0.tgz", "integrity": "sha512-Y+C3aG0JHmi4nCfixHgq0iAtqWCjMCliWghf6fXbemRKSGzpcrHdYxGZGDt8MeFg+gH7ounfMbz6WogqKCWvDg==", + "dev": true, "requires": { "escape-string-regexp": "^1.0.5", "lodash.deburr": "^4.1.0" @@ -3570,7 +3571,8 @@ }, "ansi-regex": { "version": "2.1.1", - "bundled": true + "bundled": true, + "optional": true }, "aproba": { "version": "1.2.0", @@ -3588,11 +3590,13 @@ }, "balanced-match": { "version": "1.0.0", - "bundled": true + "bundled": true, + "optional": true }, "brace-expansion": { "version": "1.1.11", "bundled": true, + "optional": true, "requires": { "balanced-match": "^1.0.0", "concat-map": "0.0.1" @@ -3605,15 +3609,18 @@ }, "code-point-at": { "version": "1.1.0", - "bundled": true + "bundled": true, + "optional": true }, "concat-map": { "version": "0.0.1", - "bundled": true + "bundled": true, + "optional": true }, "console-control-strings": { "version": "1.1.0", - "bundled": true + "bundled": true, + "optional": true }, "core-util-is": { "version": "1.0.2", @@ -3716,7 +3723,8 @@ }, "inherits": { "version": "2.0.3", - "bundled": true + "bundled": true, + "optional": true }, "ini": { "version": "1.3.5", @@ -3726,6 +3734,7 @@ "is-fullwidth-code-point": { "version": "1.0.0", "bundled": true, + "optional": true, "requires": { "number-is-nan": "^1.0.0" } @@ -3738,17 +3747,20 @@ "minimatch": { "version": "3.0.4", "bundled": true, + "optional": true, "requires": { "brace-expansion": "^1.1.7" } }, "minimist": { "version": "0.0.8", - "bundled": true + "bundled": true, + "optional": true }, "minipass": { "version": "2.3.5", "bundled": true, + "optional": true, "requires": { "safe-buffer": "^5.1.2", "yallist": "^3.0.0" @@ -3765,6 +3777,7 @@ "mkdirp": { "version": "0.5.1", "bundled": true, + "optional": true, "requires": { "minimist": "0.0.8" } @@ -3837,7 +3850,8 @@ }, "number-is-nan": { "version": "1.0.1", - "bundled": true + "bundled": true, + "optional": true }, "object-assign": { "version": "4.1.1", @@ -3847,6 +3861,7 @@ "once": { "version": "1.4.0", "bundled": true, + "optional": true, "requires": { "wrappy": "1" } @@ -3922,7 +3937,8 @@ }, "safe-buffer": { "version": "5.1.2", - "bundled": true + "bundled": true, + "optional": true }, "safer-buffer": { "version": "2.1.2", @@ -3952,6 +3968,7 @@ "string-width": { "version": "1.0.2", "bundled": true, + "optional": true, "requires": { "code-point-at": "^1.0.0", "is-fullwidth-code-point": "^1.0.0", @@ -3969,6 +3986,7 @@ "strip-ansi": { "version": "3.0.1", "bundled": true, + "optional": true, "requires": { "ansi-regex": "^2.0.0" } @@ -4007,11 +4025,13 @@ }, "wrappy": { "version": "1.0.2", - "bundled": true + "bundled": true, + "optional": true }, "yallist": { "version": "3.0.3", - "bundled": true + "bundled": true, + "optional": true } } }, @@ -7482,6 +7502,41 @@ "slash": "^1.0.0" } }, + "gatsby-plugin-plausible": { + "version": "0.0.6", + "resolved": "https://registry.npmjs.org/gatsby-plugin-plausible/-/gatsby-plugin-plausible-0.0.6.tgz", + "integrity": 
"sha512-qUdPQ3haeX2DIywGZ2boMpmFAnSbWzqS9cG9/OO0mWLigA0sDLWwGkpHIAvrfepgbB9U/roLtXflctBwOIxtcQ==", + "requires": { + "@babel/runtime": "^7.9.2", + "minimatch": "3.0.4", + "react": "^16.13.1" + }, + "dependencies": { + "@babel/runtime": { + "version": "7.10.5", + "resolved": "https://registry.npmjs.org/@babel/runtime/-/runtime-7.10.5.tgz", + "integrity": "sha512-otddXKhdNn7d0ptoFRHtMLa8LqDxLYwTjB4nYgM1yy5N6gU/MUf8zqyyLltCH3yAVitBzmwK4us+DD0l/MauAg==", + "requires": { + "regenerator-runtime": "^0.13.4" + } + }, + "react": { + "version": "16.13.1", + "resolved": "https://registry.npmjs.org/react/-/react-16.13.1.tgz", + "integrity": "sha512-YMZQQq32xHLX0bz5Mnibv1/LHb3Sqzngu7xstSM+vrkE5Kzr9xE0yMByK5kMoTK30YVJE61WfbxIFFvfeDKT1w==", + "requires": { + "loose-envify": "^1.1.0", + "object-assign": "^4.1.1", + "prop-types": "^15.6.2" + } + }, + "regenerator-runtime": { + "version": "0.13.5", + "resolved": "https://registry.npmjs.org/regenerator-runtime/-/regenerator-runtime-0.13.5.tgz", + "integrity": "sha512-ZS5w8CpKFinUzOwW3c83oPeVXoNsrLsaCoLtJvAClH135j/R77RuymhiSErhm2lKcwSCIpmvIWSbDkIfAqKQlA==" + } + } + }, "gatsby-plugin-react-helmet": { "version": "3.0.6", "resolved": "https://registry.npmjs.org/gatsby-plugin-react-helmet/-/gatsby-plugin-react-helmet-3.0.6.tgz", @@ -10198,7 +10253,8 @@ "lodash.deburr": { "version": "4.1.0", "resolved": "https://registry.npmjs.org/lodash.deburr/-/lodash.deburr-4.1.0.tgz", - "integrity": "sha1-3bG7s+8HRYwBd7oH3hRCLLAz/5s=" + "integrity": "sha1-3bG7s+8HRYwBd7oH3hRCLLAz/5s=", + "dev": true }, "lodash.defaults": { "version": "4.2.0", diff --git a/website/package.json b/website/package.json index f43b9a6a0..a59bc9bdc 100644 --- a/website/package.json +++ b/website/package.json @@ -23,6 +23,7 @@ "gatsby-plugin-google-analytics": "^2.0.14", "gatsby-plugin-manifest": "^2.0.17", "gatsby-plugin-offline": "^2.0.24", + "gatsby-plugin-plausible": "0.0.6", "gatsby-plugin-react-helmet": "^3.0.6", "gatsby-plugin-react-svg": "^2.0.0", "gatsby-plugin-sass": "^2.0.10", From cd5af72c9af469bd55bcb4bc27a94db61c448919 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Sun, 19 Jul 2020 11:09:49 +0200 Subject: [PATCH 38/43] Update pkuseg version (#5774) * Update pkuseg version in Chinese tokenizer warnings * Update pkuseg version in `Makefile` * Remove warning about python3.8 wheels in docs --- Makefile | 4 ++-- spacy/lang/zh/__init__.py | 2 +- website/docs/usage/models.md | 12 ------------ 3 files changed, 3 insertions(+), 15 deletions(-) diff --git a/Makefile b/Makefile index 865bf44c5..6c0a59ba8 100644 --- a/Makefile +++ b/Makefile @@ -5,7 +5,7 @@ VENV := ./env$(PYVER) version := $(shell "bin/get-version.sh") dist/spacy-$(version).pex : wheelhouse/spacy-$(version).stamp - $(VENV)/bin/pex -f ./wheelhouse --no-index --disable-cache -m spacy -o $@ spacy==$(version) jsonschema spacy-lookups-data jieba pkuseg==0.0.22 sudachipy sudachidict_core + $(VENV)/bin/pex -f ./wheelhouse --no-index --disable-cache -m spacy -o $@ spacy==$(version) jsonschema spacy-lookups-data jieba pkuseg==0.0.25 sudachipy sudachidict_core chmod a+rx $@ cp $@ dist/spacy.pex @@ -15,7 +15,7 @@ dist/pytest.pex : wheelhouse/pytest-*.whl wheelhouse/spacy-$(version).stamp : $(VENV)/bin/pex setup.py spacy/*.py* spacy/*/*.py* $(VENV)/bin/pip wheel . 
-w ./wheelhouse - $(VENV)/bin/pip wheel jsonschema spacy-lookups-data jieba pkuseg==0.0.22 sudachipy sudachidict_core -w ./wheelhouse + $(VENV)/bin/pip wheel jsonschema spacy-lookups-data jieba pkuseg==0.0.25 sudachipy sudachidict_core -w ./wheelhouse touch $@ wheelhouse/pytest-%.whl : $(VENV)/bin/pex diff --git a/spacy/lang/zh/__init__.py b/spacy/lang/zh/__init__.py index 9d1cb71a7..9f8a82c10 100644 --- a/spacy/lang/zh/__init__.py +++ b/spacy/lang/zh/__init__.py @@ -16,7 +16,7 @@ from .tag_map import TAG_MAP from ... import util -_PKUSEG_INSTALL_MSG = "install it with `pip install pkuseg==0.0.22` or from https://github.com/lancopku/pkuseg-python" +_PKUSEG_INSTALL_MSG = "install it with `pip install pkuseg==0.0.25` or from https://github.com/lancopku/pkuseg-python" def try_jieba_import(use_jieba): diff --git a/website/docs/usage/models.md b/website/docs/usage/models.md index b11e6347a..cc65dad68 100644 --- a/website/docs/usage/models.md +++ b/website/docs/usage/models.md @@ -117,18 +117,6 @@ The Chinese language class supports three word segmentation options: better segmentation for Chinese OntoNotes and the new [Chinese models](/models/zh). - - -Note that [`pkuseg`](https://github.com/lancopku/pkuseg-python) doesn't yet ship -with pre-compiled wheels for Python 3.8. If you're running Python 3.8, you can -install it from our fork and compile it locally: - -```bash -$ pip install https://github.com/honnibal/pkuseg-python/archive/master.zip -``` - - - The `meta` argument of the `Chinese` language class supports the following From 7e142720962e11fd62396721d7826ff2406c336c Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Sun, 19 Jul 2020 11:10:11 +0200 Subject: [PATCH 39/43] Lower upper pin for cupy to 8.0.0 (#5773) --- setup.cfg | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/setup.cfg b/setup.cfg index e556ba19c..9bd45d45d 100644 --- a/setup.cfg +++ b/setup.cfg @@ -61,21 +61,21 @@ install_requires = lookups = spacy_lookups_data>=0.3.2,<0.4.0 cuda = - cupy>=5.0.0b4,<9.0.0 + cupy>=5.0.0b4,<8.0.0 cuda80 = - cupy-cuda80>=5.0.0b4,<9.0.0 + cupy-cuda80>=5.0.0b4,<8.0.0 cuda90 = - cupy-cuda90>=5.0.0b4,<9.0.0 + cupy-cuda90>=5.0.0b4,<8.0.0 cuda91 = - cupy-cuda91>=5.0.0b4,<9.0.0 + cupy-cuda91>=5.0.0b4,<8.0.0 cuda92 = - cupy-cuda92>=5.0.0b4,<9.0.0 + cupy-cuda92>=5.0.0b4,<8.0.0 cuda100 = - cupy-cuda100>=5.0.0b4,<9.0.0 + cupy-cuda100>=5.0.0b4,<8.0.0 cuda101 = - cupy-cuda101>=5.0.0b4,<9.0.0 + cupy-cuda101>=5.0.0b4,<8.0.0 cuda102 = - cupy-cuda102>=5.0.0b4,<9.0.0 + cupy-cuda102>=5.0.0b4,<8.0.0 # Language tokenizers with external dependencies ja = sudachipy>=0.4.5 From 597bcc629e173dfd87422188dc76a2f1053a9bba Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Sun, 19 Jul 2020 11:13:39 +0200 Subject: [PATCH 40/43] Improve tag map initialization and updating (#5768) * Improve tag map initialization and updating Generalize tag map initialization and updating so that a provided tag map can be loaded correctly in the CLI. * normalize provided tag map as necessary * use the same method for initializing and overwriting the tag map * Reinitialize cache after loading new tag map Reinitialize the cache with the right size after loading a new tag map. 
--- spacy/cli/debug_data.py | 4 ++-- spacy/cli/train.py | 4 ++-- spacy/morphology.pyx | 31 ++++++++++++++++--------------- 3 files changed, 20 insertions(+), 19 deletions(-) diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py index 7a4a093e2..22540c779 100644 --- a/spacy/cli/debug_data.py +++ b/spacy/cli/debug_data.py @@ -70,8 +70,8 @@ def debug_data( else: lang_cls = get_lang_class(lang) nlp = lang_cls() - # Update tag map with provided mapping - nlp.vocab.morphology.tag_map.update(tag_map) + # Replace tag map with provided mapping + nlp.vocab.morphology.load_tag_map(tag_map) msg.divider("Data format validation") diff --git a/spacy/cli/train.py b/spacy/cli/train.py index b81214b95..e24aa8a95 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -250,8 +250,8 @@ def train( pipe_cfg = {} nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg)) - # Update tag map with provided mapping - nlp.vocab.morphology.tag_map.update(tag_map) + # Replace tag map with provided mapping + nlp.vocab.morphology.load_tag_map(tag_map) # Create empty extra lexeme tables so the data from spacy-lookups-data # isn't loaded if these features are accessed diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index a9bab38ed..18bba0124 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -150,6 +150,19 @@ cdef class Morphology: self.mem = Pool() self.strings = string_store self.tags = PreshMap() + self._feat_map = MorphologyClassMap(FEATURES) + self.load_tag_map(tag_map) + self.lemmatizer = lemmatizer + + self._cache = PreshMapArray(self.n_tags) + self.exc = {} + if exc is not None: + for (tag, orth), attrs in exc.items(): + attrs = _normalize_props(attrs) + self.add_special_case( + self.strings.as_string(tag), self.strings.as_string(orth), attrs) + + def load_tag_map(self, tag_map): # Add special space symbol. We prefix with underscore, to make sure it # always sorts to the end. 
if '_SP' in tag_map: @@ -160,29 +173,17 @@ cdef class Morphology: self.strings.add('_SP') tag_map = dict(tag_map) tag_map['_SP'] = space_attrs - self.tag_names = tuple(sorted(tag_map.keys())) self.tag_map = {} - self.lemmatizer = lemmatizer - self.n_tags = len(tag_map) self.reverse_index = {} - self._feat_map = MorphologyClassMap(FEATURES) - self._load_from_tag_map(tag_map) - - self._cache = PreshMapArray(self.n_tags) - self.exc = {} - if exc is not None: - for (tag, orth), attrs in exc.items(): - attrs = _normalize_props(attrs) - self.add_special_case( - self.strings.as_string(tag), self.strings.as_string(orth), attrs) - - def _load_from_tag_map(self, tag_map): for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())): attrs = _normalize_props(attrs) self.add({self._feat_map.id2feat[feat] for feat in attrs if feat in self._feat_map.id2feat}) self.tag_map[tag_str] = dict(attrs) self.reverse_index[self.strings.add(tag_str)] = i + self.tag_names = tuple(sorted(self.tag_map.keys())) + self.n_tags = len(self.tag_map) + self._cache = PreshMapArray(self.n_tags) def __reduce__(self): return (Morphology, (self.strings, self.tag_map, self.lemmatizer, From a8978ca285fa7ebf0867f54723a6ba5569b1c156 Mon Sep 17 00:00:00 2001 From: Alec Chapman Date: Sun, 19 Jul 2020 06:35:31 -0500 Subject: [PATCH 41/43] Add VA COVID-19 NLP project to spaCy Universe (#5777) * Update universe.json Add cov-bsv to "resources" * Update universe.json * add contributor agreement --- .github/contributors/abchapman93.md | 106 ++++++++++++++++++++++++++++ website/meta/universe.json | 24 +++++++ 2 files changed, 130 insertions(+) create mode 100644 .github/contributors/abchapman93.md diff --git a/.github/contributors/abchapman93.md b/.github/contributors/abchapman93.md new file mode 100644 index 000000000..5af0cb873 --- /dev/null +++ b/.github/contributors/abchapman93.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. 
With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [X] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Alec Chapman | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 7/17/2020 | +| GitHub username | abchapman93 | +| Website (optional) | | diff --git a/website/meta/universe.json b/website/meta/universe.json index c5eb96e43..e832b511f 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -2453,6 +2453,30 @@ "website": "https://besomi.ai" }, "category": ["standalone"] + }, + { + "id": "cov-bsv", + "title": "VA COVID-19 NLP BSV", + "slogan": "spaCy pipeline for COVID-19 surveillance.", + "github": "abchapman93/VA_COVID-19_NLP_BSV", + "description": "A spaCy rule-based pipeline for identifying positive cases of COVID-19 from clinical text. A version of this system was deployed as part of the US Department of Veterans Affairs biosurveillance response to COVID-19.", + "pip": "cov-bsv", + "code_example": [ + "import cov_bsv", + "", + "nlp = cov_bsv.load()", + "text = 'Pt tested for COVID-19. His wife was recently diagnosed with novel coronavirus. SARS-COV-2: Detected'", + "", + "print(doc.ents)", + "print(doc._.cov_classification)", + "cov_bsv.visualize_doc(doc)" + ], + "category": ["pipeline", "standalone", "biomedical", "scientific"], + "tags": ["clinical", "epidemiology", "covid-19", "surveillance"], + "author": "Alec Chapman", + "author_links": { + "github": "abchapman93" + } } ], From e6967ca98a9f39d196031aff59e7a2dae1033641 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 20 Jul 2020 14:59:41 +0200 Subject: [PATCH 42/43] Revert cupy-cuda version update --- setup.cfg | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/setup.cfg b/setup.cfg index 53a359247..a1c881d10 100644 --- a/setup.cfg +++ b/setup.cfg @@ -65,21 +65,21 @@ console_scripts = lookups = spacy_lookups_data>=0.3.2,<0.4.0 cuda = - cupy>=5.0.0b4,<8.0.0 + cupy>=5.0.0b4,<9.0.0 cuda80 = - cupy-cuda80>=5.0.0b4,<8.0.0 + cupy-cuda80>=5.0.0b4,<9.0.0 cuda90 = - cupy-cuda90>=5.0.0b4,<8.0.0 + cupy-cuda90>=5.0.0b4,<9.0.0 cuda91 = - cupy-cuda91>=5.0.0b4,<8.0.0 + cupy-cuda91>=5.0.0b4,<9.0.0 cuda92 = - cupy-cuda92>=5.0.0b4,<8.0.0 + cupy-cuda92>=5.0.0b4,<9.0.0 cuda100 = - cupy-cuda100>=5.0.0b4,<8.0.0 + cupy-cuda100>=5.0.0b4,<9.0.0 cuda101 = - cupy-cuda101>=5.0.0b4,<8.0.0 + cupy-cuda101>=5.0.0b4,<9.0.0 cuda102 = - cupy-cuda102>=5.0.0b4,<8.0.0 + cupy-cuda102>=5.0.0b4,<9.0.0 # Language tokenizers with external dependencies ja = sudachipy>=0.4.5 From d51db72e461261ddc74e70abd7e2f745610a4408 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 20 Jul 2020 15:01:36 +0200 Subject: [PATCH 43/43] Remove Python 2 marker --- spacy/tests/regression/test_issue3001-3500.py | 1 - 1 file changed, 1 deletion(-) diff --git a/spacy/tests/regression/test_issue3001-3500.py b/spacy/tests/regression/test_issue3001-3500.py index 4219da40f..ca4733f0d 100644 --- a/spacy/tests/regression/test_issue3001-3500.py +++ b/spacy/tests/regression/test_issue3001-3500.py @@ -121,7 +121,6 @@ def test_issue3248_1(): assert len(matcher) == 2 -@pytest.mark.skipif(is_python2, reason="Can't pickle instancemethod for is_base_form") def test_issue3248_2(): """Test that the PhraseMatcher can be pickled correctly.""" nlp = English()