From 81d3a1edb11a914e8854193137900d3cd80ad3dd Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 27 Jul 2021 12:07:01 +0200 Subject: [PATCH 001/133] Use tokenizer URL_MATCH pattern in LIKE_URL (#8765) --- spacy/lang/lex_attrs.py | 3 +++ spacy/tests/lang/test_attrs.py | 3 ++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/spacy/lang/lex_attrs.py b/spacy/lang/lex_attrs.py index 12016c273..6ed981a06 100644 --- a/spacy/lang/lex_attrs.py +++ b/spacy/lang/lex_attrs.py @@ -3,6 +3,7 @@ import unicodedata import re from .. import attrs +from .tokenizer_exceptions import URL_MATCH _like_email = re.compile(r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)").match @@ -109,6 +110,8 @@ def like_url(text: str) -> bool: return True if tld.isalpha() and tld in _tlds: return True + if URL_MATCH(text): + return True return False diff --git a/spacy/tests/lang/test_attrs.py b/spacy/tests/lang/test_attrs.py index b39109455..6a7a404fd 100644 --- a/spacy/tests/lang/test_attrs.py +++ b/spacy/tests/lang/test_attrs.py @@ -58,9 +58,10 @@ def test_lex_attrs_is_currency(text, match): ("www.google.com", True), ("google.com", True), ("sydney.com", True), - ("2girls1cup.org", True), + ("1abc2def.org", True), ("http://stupid", True), ("www.hi", True), + ("example.com/example", True), ("dog", False), ("1.2", False), ("1.a", False), From 733ffe439d0925f8ba92048ad2250c8107262d82 Mon Sep 17 00:00:00 2001 From: Dimitar Ganev Date: Tue, 10 Aug 2021 14:44:23 +0300 Subject: [PATCH 002/133] Improve the stop words and the tokenizer exceptions in Bulgarian language. (#8862) * Add more stop words and Improve the readability * Add and categorize the tokenizer exceptions for `bg` lang * Create syrull.md * Add references for the additional stop words and tokenizer exc abbrs --- .github/contributors/syrull.md | 106 ++++++++ spacy/lang/bg/stop_words.py | 334 ++++++-------------------- spacy/lang/bg/tokenizer_exceptions.py | 182 ++++++++++++-- 3 files changed, 344 insertions(+), 278 deletions(-) create mode 100644 .github/contributors/syrull.md diff --git a/.github/contributors/syrull.md b/.github/contributors/syrull.md new file mode 100644 index 000000000..82cdade12 --- /dev/null +++ b/.github/contributors/syrull.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. 
With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Dimitar Ganev | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2021/8/2 | +| GitHub username | syrull | +| Website (optional) | | diff --git a/spacy/lang/bg/stop_words.py b/spacy/lang/bg/stop_words.py index aae7692a2..df708b65e 100644 --- a/spacy/lang/bg/stop_words.py +++ b/spacy/lang/bg/stop_words.py @@ -1,265 +1,79 @@ -# Source: https://github.com/Alir3z4/stop-words - +""" +References: + https://github.com/Alir3z4/stop-words - Original list, serves as a base. + https://postvai.com/books/stop-dumi.pdf - Additions to the original list in order to improve it. +""" STOP_WORDS = set( """ -а -автентичен -аз -ако -ала -бе -без -беше -би -бивш -бивша -бившо -бил -била -били -било -благодаря -близо -бъдат -бъде -бяха -в -вас -ваш -ваша -вероятно -вече -взема -ви -вие -винаги -внимава -време -все -всеки -всички -всичко -всяка -във -въпреки -върху -г -ги -главен -главна -главно -глас -го -година -години -годишен -д -да -дали -два -двама -двамата -две -двете -ден -днес -дни -до -добра -добре -добро -добър -докато -докога -дори -досега -доста -друг -друга -други -е -евтин -едва -един -една -еднаква -еднакви -еднакъв -едно -екип -ето -живот -за -забавям -зад -заедно -заради -засега -заспал -затова -защо -защото -и -из -или -им -има -имат -иска -й -каза -как -каква -какво -както -какъв -като -кога -когато -което -които -кой -който -колко -която -къде -където -към -лесен -лесно -ли -лош -м -май -малко -ме -между -мек -мен -месец -ми -много -мнозина -мога -могат -може -мокър -моля -момента -му -н -на -над -назад -най -направи -напред -например -нас -не -него -нещо -нея -ни -ние -никой -нито -нищо -но -нов -нова -нови -новина -някои -някой -няколко -няма -обаче -около -освен -особено -от -отгоре -отново -още -пак -по -повече -повечето -под -поне -поради -после -почти -прави -пред -преди -през -при -пък -първата -първи -първо -пъти -равен -равна -с -са -сам -само -се -сега -си -син -скоро -след -следващ -сме -смях -според -сред -срещу -сте -съм -със -също -т -тази -така -такива -такъв -там -твой -те -тези -ти -т.н. 
-то -това -тогава -този -той -толкова -точно -три -трябва -тук -тъй -тя -тях -у -утре -харесва -хиляди -ч -часа -че -често -чрез -ще -щом +а автентичен аз ако ала + +бе без беше би бивш бивша бившо бивши бил била били било благодаря близо бъдат +бъде бъда бяха + +в вас ваш ваша вашата вашият вероятно вече взема ви вие винаги внимава време все +всеки всички вместо всичко вследствие всъщност всяка втори във въпреки върху +вътре веднъж + +г ги главен главна главно глас го годно година години годишен + +д да дали далеч далече два двама двамата две двете ден днес дни до добра добре +добро добър достатъчно докато докога дори досега доста друг друга другаде други + +е евтин едва един една еднаква еднакви еднакъв едно екип ето + +живот жив + +за здравей здрасти знае зная забавям зад зададени заедно заради засега заспал +затова запазва започвам защо защото завинаги + +и из или им има имат иска искам използвайки изглежда изглеждаше изглеждайки +извън имайки + +й йо + +каза казва казвайки казвам как каква какво както какъв като кога кауза каузи +когато когото което които кой който колко която къде където към край кратък +кръгъл + +лесен лесно ли летя летиш летим лош + +м май малко макар малцина междувременно минус ме между мек мен месец ми мис +мисля много мнозина мога могат може мой можем мокър моля момента му + +н на над назад най наш навсякъде навътре нагоре направи напред надолу наистина +например наопаки наполовина напоследък нека независимо нас насам наскоро +настрана необходимо него негов нещо нея ни ние никой нито нищо но нов някак нова +нови новина някои някой някога някъде няколко няма + +о обаче около описан опитах опитва опитвайки опитвам определен определено освен +обикновено осигурява обратно означава особен особено от ох отвъд отгоре отдолу +отново отива отивам отидох отсега отделно отколкото откъдето очевидно оттам +относно още + +п пак по повече повечето под поне просто пряко поради после последен последно +посочен почти прави прав прави правя пред преди през при пък първата първи първо +път пъти плюс + +равен равна различен различни разумен разумно + +с са сам само себе сериозно сигурен сигурно се сега си син скоро скорошен след +следващ следващия следва следното следователно случва сме смях собствен +сравнително смея според сред става срещу съвсем съдържа съдържащ съжалявам +съответен съответно сте съм със също + +т така техен техни такива такъв твърде там трета твой те тези ти то това +тогава този той търси толкова точно три трябва тук тъй тя тях + +у утре ужасно употреба успоредно уточнен уточняване + +харесва харесали хиляди + +ч часа ценя цяло цялостен че често чрез чудя + +ще щеше щом щяха + юмрук -я -як + +я як """.split() ) diff --git a/spacy/lang/bg/tokenizer_exceptions.py b/spacy/lang/bg/tokenizer_exceptions.py index 0b7487c64..0f484b778 100644 --- a/spacy/lang/bg/tokenizer_exceptions.py +++ b/spacy/lang/bg/tokenizer_exceptions.py @@ -1,10 +1,16 @@ +""" +References: + https://slovored.com/bg/abbr/grammar/ - Additional refs for abbreviations + (countries, occupations, fields of studies and more). 
+""" + from ...symbols import ORTH, NORM _exc = {} - -_abbr_exc = [ +# measurements +for abbr in [ {ORTH: "м", NORM: "метър"}, {ORTH: "мм", NORM: "милиметър"}, {ORTH: "см", NORM: "сантиметър"}, @@ -17,51 +23,191 @@ _abbr_exc = [ {ORTH: "хл", NORM: "хектолиър"}, {ORTH: "дкл", NORM: "декалитър"}, {ORTH: "л", NORM: "литър"}, -] -for abbr in _abbr_exc: +]: _exc[abbr[ORTH]] = [abbr] -_abbr_line_exc = [ +# line abbreviations +for abbr in [ {ORTH: "г-жа", NORM: "госпожа"}, {ORTH: "г-н", NORM: "господин"}, {ORTH: "г-ца", NORM: "госпожица"}, {ORTH: "д-р", NORM: "доктор"}, {ORTH: "о-в", NORM: "остров"}, {ORTH: "п-в", NORM: "полуостров"}, -] - -for abbr in _abbr_line_exc: + {ORTH: "с-у", NORM: "срещу"}, + {ORTH: "в-у", NORM: "върху"}, + {ORTH: "м-у", NORM: "между"}, +]: _exc[abbr[ORTH]] = [abbr] -_abbr_dot_exc = [ +# foreign language related abbreviations +for abbr in [ + {ORTH: "англ.", NORM: "английски"}, + {ORTH: "ан.", NORM: "английски термин"}, + {ORTH: "араб.", NORM: "арабски"}, + {ORTH: "афр.", NORM: "африкански"}, + {ORTH: "гр.", NORM: "гръцки"}, + {ORTH: "лат.", NORM: "латински"}, + {ORTH: "рим.", NORM: "римски"}, + {ORTH: "старогр.", NORM: "старогръцки"}, + {ORTH: "староевр.", NORM: "староеврейски"}, + {ORTH: "фр.", NORM: "френски"}, + {ORTH: "хол.", NORM: "холандски"}, + {ORTH: "швед.", NORM: "шведски"}, + {ORTH: "шотл.", NORM: "шотландски"}, + {ORTH: "яп.", NORM: "японски"}, +]: + _exc[abbr[ORTH]] = [abbr] + +# profession and academic titles abbreviations +for abbr in [ {ORTH: "акад.", NORM: "академик"}, - {ORTH: "ал.", NORM: "алинея"}, {ORTH: "арх.", NORM: "архитект"}, + {ORTH: "инж.", NORM: "инженер"}, + {ORTH: "канц.", NORM: "канцлер"}, + {ORTH: "проф.", NORM: "професор"}, + {ORTH: "св.", NORM: "свети"}, +]: + _exc[abbr[ORTH]] = [abbr] + +# fields of studies +for abbr in [ + {ORTH: "агр.", NORM: "агрономия"}, + {ORTH: "ав.", NORM: "авиация"}, + {ORTH: "агр.", NORM: "агрономия"}, + {ORTH: "археол.", NORM: "археология"}, + {ORTH: "астр.", NORM: "астрономия"}, + {ORTH: "геод.", NORM: "геодезия"}, + {ORTH: "геол.", NORM: "геология"}, + {ORTH: "геом.", NORM: "геометрия"}, + {ORTH: "гимн.", NORM: "гимнастика"}, + {ORTH: "грам.", NORM: "граматика"}, + {ORTH: "жур.", NORM: "журналистика"}, + {ORTH: "журн.", NORM: "журналистика"}, + {ORTH: "зем.", NORM: "земеделие"}, + {ORTH: "икон.", NORM: "икономика"}, + {ORTH: "лит.", NORM: "литература"}, + {ORTH: "мат.", NORM: "математика"}, + {ORTH: "мед.", NORM: "медицина"}, + {ORTH: "муз.", NORM: "музика"}, + {ORTH: "печ.", NORM: "печатарство"}, + {ORTH: "пол.", NORM: "политика"}, + {ORTH: "псих.", NORM: "психология"}, + {ORTH: "соц.", NORM: "социология"}, + {ORTH: "стат.", NORM: "статистика"}, + {ORTH: "стил.", NORM: "стилистика"}, + {ORTH: "топогр.", NORM: "топография"}, + {ORTH: "търг.", NORM: "търговия"}, + {ORTH: "фарм.", NORM: "фармацевтика"}, + {ORTH: "фехт.", NORM: "фехтовка"}, + {ORTH: "физиол.", NORM: "физиология"}, + {ORTH: "физ.", NORM: "физика"}, + {ORTH: "фил.", NORM: "философия"}, + {ORTH: "фин.", NORM: "финанси"}, + {ORTH: "фолкл.", NORM: "фолклор"}, + {ORTH: "фон.", NORM: "фонетика"}, + {ORTH: "фот.", NORM: "фотография"}, + {ORTH: "футб.", NORM: "футбол"}, + {ORTH: "хим.", NORM: "химия"}, + {ORTH: "хир.", NORM: "хирургия"}, + {ORTH: "ел.", NORM: "електротехника"}, +]: + _exc[abbr[ORTH]] = [abbr] + +for abbr in [ + {ORTH: "ал.", NORM: "алинея"}, + {ORTH: "авт.", NORM: "автоматично"}, + {ORTH: "адм.", NORM: "администрация"}, + {ORTH: "арт.", NORM: "артилерия"}, {ORTH: "бл.", NORM: "блок"}, {ORTH: "бр.", NORM: "брой"}, {ORTH: "бул.", NORM: 
"булевард"}, + {ORTH: "букв.", NORM: "буквално"}, {ORTH: "в.", NORM: "век"}, + {ORTH: "вр.", NORM: "време"}, + {ORTH: "вм.", NORM: "вместо"}, + {ORTH: "воен.", NORM: "военен термин"}, {ORTH: "г.", NORM: "година"}, {ORTH: "гр.", NORM: "град"}, + {ORTH: "гл.", NORM: "глагол"}, + {ORTH: "др.", NORM: "други"}, + {ORTH: "ез.", NORM: "езеро"}, {ORTH: "ж.р.", NORM: "женски род"}, - {ORTH: "инж.", NORM: "инженер"}, + {ORTH: "жп.", NORM: "железопът"}, + {ORTH: "застр.", NORM: "застрахователно дело"}, + {ORTH: "знач.", NORM: "значение"}, + {ORTH: "и др.", NORM: "и други"}, + {ORTH: "и под.", NORM: "и подобни"}, + {ORTH: "и пр.", NORM: "и прочие"}, + {ORTH: "изр.", NORM: "изречение"}, + {ORTH: "изт.", NORM: "източен"}, + {ORTH: "конкр.", NORM: "конкретно"}, {ORTH: "лв.", NORM: "лев"}, + {ORTH: "л.", NORM: "лице"}, {ORTH: "м.р.", NORM: "мъжки род"}, - {ORTH: "мат.", NORM: "математика"}, - {ORTH: "мед.", NORM: "медицина"}, + {ORTH: "мин.вр.", NORM: "минало време"}, + {ORTH: "мн.ч.", NORM: "множествено число"}, + {ORTH: "напр.", NORM: "например"}, + {ORTH: "нар.", NORM: "наречие"}, + {ORTH: "науч.", NORM: "научен термин"}, + {ORTH: "непр.", NORM: "неправилно"}, + {ORTH: "обик.", NORM: "обикновено"}, + {ORTH: "опред.", NORM: "определение"}, + {ORTH: "особ.", NORM: "особено"}, + {ORTH: "ост.", NORM: "остаряло"}, + {ORTH: "относ.", NORM: "относително"}, + {ORTH: "отр.", NORM: "отрицателно"}, {ORTH: "пл.", NORM: "площад"}, - {ORTH: "проф.", NORM: "професор"}, + {ORTH: "пад.", NORM: "падеж"}, + {ORTH: "парл.", NORM: "парламентарен"}, + {ORTH: "погов.", NORM: "поговорка"}, + {ORTH: "пон.", NORM: "понякога"}, + {ORTH: "правосл.", NORM: "православен"}, + {ORTH: "прибл.", NORM: "приблизително"}, + {ORTH: "прил.", NORM: "прилагателно име"}, + {ORTH: "пр.", NORM: "прочие"}, {ORTH: "с.", NORM: "село"}, {ORTH: "с.р.", NORM: "среден род"}, - {ORTH: "св.", NORM: "свети"}, {ORTH: "сп.", NORM: "списание"}, {ORTH: "стр.", NORM: "страница"}, + {ORTH: "сз.", NORM: "съюз"}, + {ORTH: "сег.", NORM: "сегашно"}, + {ORTH: "сп.", NORM: "спорт"}, + {ORTH: "срв.", NORM: "сравни"}, + {ORTH: "с.ст.", NORM: "селскостопанска техника"}, + {ORTH: "счет.", NORM: "счетоводство"}, + {ORTH: "съкр.", NORM: "съкратено"}, + {ORTH: "съобщ.", NORM: "съобщение"}, + {ORTH: "същ.", NORM: "съществително"}, + {ORTH: "текст.", NORM: "текстилен"}, + {ORTH: "телев.", NORM: "телевизия"}, + {ORTH: "тел.", NORM: "телефон"}, + {ORTH: "т.е.", NORM: "тоест"}, + {ORTH: "т.н.", NORM: "така нататък"}, + {ORTH: "т.нар.", NORM: "така наречен"}, + {ORTH: "търж.", NORM: "тържествено"}, {ORTH: "ул.", NORM: "улица"}, + {ORTH: "уч.", NORM: "училище"}, + {ORTH: "унив.", NORM: "университет"}, + {ORTH: "харт.", NORM: "хартия"}, + {ORTH: "хидр.", NORM: "хидравлика"}, + {ORTH: "хран.", NORM: "хранителна"}, + {ORTH: "църк.", NORM: "църковен термин"}, + {ORTH: "числ.", NORM: "числително"}, {ORTH: "чл.", NORM: "член"}, -] - -for abbr in _abbr_dot_exc: + {ORTH: "ч.", NORM: "число"}, + {ORTH: "числ.", NORM: "числително"}, + {ORTH: "шахм.", NORM: "шахмат"}, + {ORTH: "шах.", NORM: "шахмат"}, + {ORTH: "юр.", NORM: "юридически"}, +]: _exc[abbr[ORTH]] = [abbr] +# slash abbreviations +for abbr in [ + {ORTH: "м/у", NORM: "между"}, + {ORTH: "с/у", NORM: "срещу"}, +]: + _exc[abbr[ORTH]] = [abbr] TOKENIZER_EXCEPTIONS = _exc From ee011ca96341405ac3c78471f5f45293437fc9ee Mon Sep 17 00:00:00 2001 From: fgaim Date: Tue, 10 Aug 2021 13:55:08 +0200 Subject: [PATCH 003/133] =?UTF-8?q?Update=20Tigrinya=20=E1=89=B5=E1=8C=8D?= =?UTF-8?q?=E1=88=AD=E1=8A=9B=20language=20support=20(#8900)?= 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add missing punctuation for Tigrinya and Amharic * Fix numeral and ordinal numbers for Tigrinya - Amharic was used in many cases - Also fixed some typos * Update Tigrinya stop-words * Contributor agreement for fgaim * Fix typo in "ti" lang test * Remove multi-word entries from numbers and ordinals --- .github/contributors/fgaim.md | 106 +++++++++++++++++++++++++++++++ spacy/lang/am/punctuation.py | 2 +- spacy/lang/ti/lex_attrs.py | 53 ++++------------ spacy/lang/ti/punctuation.py | 2 +- spacy/lang/ti/stop_words.py | 23 ++++++- spacy/tests/lang/ti/test_text.py | 2 +- 6 files changed, 143 insertions(+), 45 deletions(-) create mode 100644 .github/contributors/fgaim.md diff --git a/.github/contributors/fgaim.md b/.github/contributors/fgaim.md new file mode 100644 index 000000000..1c3b409b4 --- /dev/null +++ b/.github/contributors/fgaim.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Fitsum Gaim | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2021-08-07 | +| GitHub username | fgaim | +| Website (optional) | | diff --git a/spacy/lang/am/punctuation.py b/spacy/lang/am/punctuation.py index 70af12039..555a179fa 100644 --- a/spacy/lang/am/punctuation.py +++ b/spacy/lang/am/punctuation.py @@ -1,7 +1,7 @@ from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY from ..char_classes import UNITS, ALPHA_UPPER -_list_punct = LIST_PUNCT + "፡ ። ፣ ፤ ፥ ፦ ፧".strip().split() +_list_punct = LIST_PUNCT + "፡ ። ፣ ፤ ፥ ፦ ፧ ፠ ፨".strip().split() _suffixes = ( _list_punct diff --git a/spacy/lang/ti/lex_attrs.py b/spacy/lang/ti/lex_attrs.py index ed094de3b..b29bd8c96 100644 --- a/spacy/lang/ti/lex_attrs.py +++ b/spacy/lang/ti/lex_attrs.py @@ -2,7 +2,7 @@ from ...attrs import LIKE_NUM _num_words = [ "ዜሮ", - "ሐደ", + "ሓደ", "ክልተ", "ሰለስተ", "ኣርባዕተ", @@ -11,66 +11,37 @@ _num_words = [ "ሸውዓተ", "ሽሞንተ", "ትሽዓተ", - "ኣሰርተ", - "ኣሰርተ ሐደ", - "ኣሰርተ ክልተ", - "ኣሰርተ ሰለስተ", - "ኣሰርተ ኣርባዕተ", - "ኣሰርተ ሓሙሽተ", - "ኣሰርተ ሽድሽተ", - "ኣሰርተ ሸውዓተ", - "ኣሰርተ ሽሞንተ", - "ኣሰርተ ትሽዓተ", + "ዓሰርተ", "ዕስራ", "ሰላሳ", "ኣርብዓ", - "ሃምሳ", - "ስልሳ", + "ሓምሳ", + "ሱሳ", "ሰብዓ", "ሰማንያ", - "ተስዓ", + "ቴስዓ", "ሚእቲ", "ሺሕ", "ሚልዮን", "ቢልዮን", "ትሪልዮን", "ኳድሪልዮን", - "ገጅልዮን", - "ባዝልዮን", + "ጋዚልዮን", + "ባዚልዮን" ] +# Tigrinya ordinals above 10 are the same as _num_words but start with "መበል " _ordinal_words = [ "ቀዳማይ", "ካልኣይ", "ሳልሳይ", - "ራብኣይ", + "ራብዓይ", "ሓምሻይ", "ሻድሻይ", "ሻውዓይ", "ሻምናይ", - "ዘጠነኛ", - "አስረኛ", - "ኣሰርተ አንደኛ", - "ኣሰርተ ሁለተኛ", - "ኣሰርተ ሶስተኛ", - "ኣሰርተ አራተኛ", - "ኣሰርተ አምስተኛ", - "ኣሰርተ ስድስተኛ", - "ኣሰርተ ሰባተኛ", - "ኣሰርተ ስምንተኛ", - "ኣሰርተ ዘጠነኛ", - "ሃያኛ", - "ሰላሳኛ" "አርባኛ", - "አምሳኛ", - "ስድሳኛ", - "ሰባኛ", - "ሰማንያኛ", - "ዘጠናኛ", - "መቶኛ", - "ሺኛ", - "ሚሊዮንኛ", - "ቢሊዮንኛ", - "ትሪሊዮንኛ", + "ታሽዓይ", + "ዓስራይ" ] @@ -92,7 +63,7 @@ def like_num(text): # Check ordinal number if text_lower in _ordinal_words: return True - if text_lower.endswith("ኛ"): + if text_lower.endswith("ይ"): if text_lower[:-2].isdigit(): return True diff --git a/spacy/lang/ti/punctuation.py b/spacy/lang/ti/punctuation.py index 772b009bf..aa884c2ba 100644 --- a/spacy/lang/ti/punctuation.py +++ b/spacy/lang/ti/punctuation.py @@ -1,7 +1,7 @@ from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY from ..char_classes import UNITS, ALPHA_UPPER -_list_punct = LIST_PUNCT + "፡ ። ፣ ፤ ፥ ፦ ፧".strip().split() +_list_punct = LIST_PUNCT + "፡ ። ፣ ፤ ፥ ፦ ፧ ፠ ፨".strip().split() _suffixes = ( _list_punct diff --git a/spacy/lang/ti/stop_words.py b/spacy/lang/ti/stop_words.py index c4f8f20fa..9bd712200 100644 --- a/spacy/lang/ti/stop_words.py +++ b/spacy/lang/ti/stop_words.py @@ -1,6 +1,27 @@ +# Stop words from Tigrinya Wordcount: https://github.com/fgaim/Tigrinya-WordCount/blob/main/ti_stop_words.txt + # Stop words STOP_WORDS = set( """ -ግን ግና ንስኻ ንስኺ ንስኻትክን ንስኻትኩም ናትካ ናትኪ ናትክን ናትኩም +'ምበር 'ሞ 'ቲ 'ታ 'ኳ 'ውን 'ዚ 'የ 'ዩ 'ያ 'ዮም 'ዮን +ልዕሊ ሒዙ ሒዛ ሕጂ መበል መን መንጎ መጠን ማለት ምስ ምባል +ምእንቲ ምኽንያቱ ምኽንያት ምዃኑ ምዃንና ምዃኖም +ስለ ስለዚ ስለዝበላ ሽዑ ቅድሚ በለ በቲ በዚ ብምባል ብተወሳኺ ብኸመይ +ብዘይ ብዘይካ ብዙሕ ብዛዕባ ብፍላይ ተባሂሉ ነበረ ነቲ ነታ ነቶም +ነዚ ነይሩ ነገራት ነገር ናብ ናብቲ ናትኩም ናትኪ ናትካ ናትክን +ናይ ናይቲ ንሕና ንሱ ንሳ ንሳቶም ንስኺ ንስኻ ንስኻትኩም ንስኻትክን ንዓይ +ኢለ ኢሉ ኢላ ኢልካ ኢሎም ኢና ኢኻ ኢዩ ኣለኹ +ኣለዉ ኣለዎ ኣሎ ኣብ ኣብቲ ኣብታ ኣብኡ ኣብዚ ኣነ ኣዝዩ ኣይኮነን ኣይኰነን +እምበር እሞ እተን እቲ እታ እቶም እንተ እንተሎ +ኣላ እንተኾነ እንታይ እንከሎ እኳ እዋን እውን እዚ እዛ እዞም +እየ እየን እዩ እያ እዮም +ከሎ ከመይ ከም ከምቲ ከምኡ ከምዘሎ +ከምዚ ከኣ ኩሉ ካልእ ካልኦት ካብ ካብቲ ካብቶም ክሳብ ክሳዕ ክብል +ክንደይ ክንዲ ክኸውን ኮይኑ ኰይኑ ኵሉ ኸም ኸኣ ወይ +ዋላ ዘለና ዘለዉ ዘለዋ ዘለዎ ዘለዎም ዘላ 
ዘሎ ዘይብሉ +ዝርከብ ዝበሃል ዝበለ ዝብል ዝተባህለ ዝተኻየደ ዝተፈላለየ ዝተፈላለዩ +ዝነበረ ዝነበረት ዝነበሩ ዝካየድ ዝኸውን ዝኽእል ዝኾነ ዝዀነ +የለን ይቕረብ ይብል ይኸውን ይኹን ይኽእል ደኣ ድሕሪ ድማ +ገለ ገሊጹ ገና ገይሩ ግና ግን ጥራይ """.split() ) diff --git a/spacy/tests/lang/ti/test_text.py b/spacy/tests/lang/ti/test_text.py index 177a9e4b2..d21005640 100644 --- a/spacy/tests/lang/ti/test_text.py +++ b/spacy/tests/lang/ti/test_text.py @@ -37,7 +37,7 @@ def test_ti_tokenizer_handles_cnts(ti_tokenizer, text, length): ("10.000", True), ("1000", True), ("999,0", True), - ("ሐደ", True), + ("ሓደ", True), ("ክልተ", True), ("ትሪልዮን", True), ("ከልቢ", False), From f99d6d5e39c25ebccf13784c5d234be30a688ce4 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 10 Aug 2021 15:13:39 +0200 Subject: [PATCH 004/133] Refactor scoring methods to use registered functions (#8766) * Add scorer option to components Add an optional `scorer` parameter to all pipeline components. If a scoring function is provided, it overrides the default scoring method for that component. * Add registered scorers for all components * Add `scorers` registry * Move all scoring methods outside of components as independent functions and register * Use the registered scoring methods as defaults in configs and inits Additional: * The scoring methods no longer have access to the full component, so use settings from `cfg` as default scorer options to handle settings such as `labels`, `threshold`, and `positive_label` * The `attribute_ruler` scoring method no longer has access to the patterns, so all scoring methods are called * Bug fix: `spancat` scoring method is updated to set `allow_overlap` to score overlapping spans correctly * Update Russian lemmatizer to use direct score method * Check type of cfg in Pipe.score * Fix check * Update spacy/pipeline/sentencizer.pyx Co-authored-by: Sofie Van Landeghem * Remove validate_examples from scoring functions * Use Pipe.labels instead of Pipe.cfg["labels"] Co-authored-by: Sofie Van Landeghem --- spacy/lang/bn/__init__.py | 20 ++++- spacy/lang/ca/__init__.py | 20 ++++- spacy/lang/el/__init__.py | 20 ++++- spacy/lang/en/__init__.py | 20 ++++- spacy/lang/es/__init__.py | 20 ++++- spacy/lang/fa/__init__.py | 20 ++++- spacy/lang/fr/__init__.py | 20 ++++- spacy/lang/it/__init__.py | 20 ++++- spacy/lang/mk/__init__.py | 20 ++++- spacy/lang/nb/__init__.py | 20 ++++- spacy/lang/nl/__init__.py | 20 ++++- spacy/lang/pl/__init__.py | 20 ++++- spacy/lang/ru/__init__.py | 14 +++- spacy/lang/ru/lemmatizer.py | 6 +- spacy/lang/sv/__init__.py | 20 ++++- spacy/lang/uk/__init__.py | 20 ++++- spacy/lang/uk/lemmatizer.py | 6 +- spacy/pipeline/attributeruler.py | 93 +++++++++++---------- spacy/pipeline/dep_parser.pyx | 73 +++++++++------- spacy/pipeline/entity_linker.py | 27 +++--- spacy/pipeline/entityruler.py | 21 +++-- spacy/pipeline/lemmatizer.py | 43 ++++++---- spacy/pipeline/morphologizer.pyx | 49 +++++------ spacy/pipeline/ner.pyx | 42 ++++++---- spacy/pipeline/pipe.pyx | 11 +++ spacy/pipeline/sentencizer.pyx | 38 ++++----- spacy/pipeline/senter.pyx | 40 ++++----- spacy/pipeline/spancat.py | 49 +++++------ spacy/pipeline/tagger.pyx | 32 +++---- spacy/pipeline/textcat.py | 58 +++++++------ spacy/pipeline/textcat_multilabel.py | 53 +++++++----- spacy/pipeline/trainable_pipe.pxd | 1 + spacy/pipeline/transition_parser.pyx | 4 +- spacy/scorer.py | 4 +- spacy/tests/pipeline/test_attributeruler.py | 56 +++++++++---- spacy/util.py | 1 + 36 files changed, 638 insertions(+), 363 deletions(-) diff --git a/spacy/lang/bn/__init__.py b/spacy/lang/bn/__init__.py index 23c3ff485..945560aac 100644 --- 
a/spacy/lang/bn/__init__.py +++ b/spacy/lang/bn/__init__.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import Optional, Callable from thinc.api import Model from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES @@ -23,13 +23,25 @@ class Bengali(Language): @Bengali.factory( "lemmatizer", assigns=["token.lemma"], - default_config={"model": None, "mode": "rule", "overwrite": False}, + default_config={ + "model": None, + "mode": "rule", + "overwrite": False, + "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"}, + }, default_score_weights={"lemma_acc": 1.0}, ) def make_lemmatizer( - nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool + nlp: Language, + model: Optional[Model], + name: str, + mode: str, + overwrite: bool, + scorer: Optional[Callable], ): - return Lemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) + return Lemmatizer( + nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer + ) __all__ = ["Bengali"] diff --git a/spacy/lang/ca/__init__.py b/spacy/lang/ca/__init__.py index 81f39b13c..15d395c12 100644 --- a/spacy/lang/ca/__init__.py +++ b/spacy/lang/ca/__init__.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import Optional, Callable from thinc.api import Model @@ -28,13 +28,25 @@ class Catalan(Language): @Catalan.factory( "lemmatizer", assigns=["token.lemma"], - default_config={"model": None, "mode": "rule", "overwrite": False}, + default_config={ + "model": None, + "mode": "rule", + "overwrite": False, + "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"}, + }, default_score_weights={"lemma_acc": 1.0}, ) def make_lemmatizer( - nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool + nlp: Language, + model: Optional[Model], + name: str, + mode: str, + overwrite: bool, + scorer: Optional[Callable], ): - return CatalanLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) + return CatalanLemmatizer( + nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer + ) __all__ = ["Catalan"] diff --git a/spacy/lang/el/__init__.py b/spacy/lang/el/__init__.py index be59a3500..e843114fc 100644 --- a/spacy/lang/el/__init__.py +++ b/spacy/lang/el/__init__.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import Optional, Callable from thinc.api import Model from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS @@ -28,13 +28,25 @@ class Greek(Language): @Greek.factory( "lemmatizer", assigns=["token.lemma"], - default_config={"model": None, "mode": "rule", "overwrite": False}, + default_config={ + "model": None, + "mode": "rule", + "overwrite": False, + "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"}, + }, default_score_weights={"lemma_acc": 1.0}, ) def make_lemmatizer( - nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool + nlp: Language, + model: Optional[Model], + name: str, + mode: str, + overwrite: bool, + scorer: Optional[Callable], ): - return GreekLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) + return GreekLemmatizer( + nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer + ) __all__ = ["Greek"] diff --git a/spacy/lang/en/__init__.py b/spacy/lang/en/__init__.py index eea522908..a84b50476 100644 --- a/spacy/lang/en/__init__.py +++ b/spacy/lang/en/__init__.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import Optional, Callable from thinc.api import Model from .tokenizer_exceptions import 
TOKENIZER_EXCEPTIONS @@ -26,13 +26,25 @@ class English(Language): @English.factory( "lemmatizer", assigns=["token.lemma"], - default_config={"model": None, "mode": "rule", "overwrite": False}, + default_config={ + "model": None, + "mode": "rule", + "overwrite": False, + "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"}, + }, default_score_weights={"lemma_acc": 1.0}, ) def make_lemmatizer( - nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool + nlp: Language, + model: Optional[Model], + name: str, + mode: str, + overwrite: bool, + scorer: Optional[Callable], ): - return EnglishLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) + return EnglishLemmatizer( + nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer + ) __all__ = ["English"] diff --git a/spacy/lang/es/__init__.py b/spacy/lang/es/__init__.py index 4b329b6f7..2f246a678 100644 --- a/spacy/lang/es/__init__.py +++ b/spacy/lang/es/__init__.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import Optional, Callable from thinc.api import Model from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS @@ -26,13 +26,25 @@ class Spanish(Language): @Spanish.factory( "lemmatizer", assigns=["token.lemma"], - default_config={"model": None, "mode": "rule", "overwrite": False}, + default_config={ + "model": None, + "mode": "rule", + "overwrite": False, + "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"}, + }, default_score_weights={"lemma_acc": 1.0}, ) def make_lemmatizer( - nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool + nlp: Language, + model: Optional[Model], + name: str, + mode: str, + overwrite: bool, + scorer: Optional[Callable], ): - return SpanishLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) + return SpanishLemmatizer( + nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer + ) __all__ = ["Spanish"] diff --git a/spacy/lang/fa/__init__.py b/spacy/lang/fa/__init__.py index 77a0a28b9..0c3100f2b 100644 --- a/spacy/lang/fa/__init__.py +++ b/spacy/lang/fa/__init__.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import Optional, Callable from thinc.api import Model from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS @@ -26,13 +26,25 @@ class Persian(Language): @Persian.factory( "lemmatizer", assigns=["token.lemma"], - default_config={"model": None, "mode": "rule", "overwrite": False}, + default_config={ + "model": None, + "mode": "rule", + "overwrite": False, + "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"}, + }, default_score_weights={"lemma_acc": 1.0}, ) def make_lemmatizer( - nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool + nlp: Language, + model: Optional[Model], + name: str, + mode: str, + overwrite: bool, + scorer: Optional[Callable], ): - return Lemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) + return Lemmatizer( + nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer + ) __all__ = ["Persian"] diff --git a/spacy/lang/fr/__init__.py b/spacy/lang/fr/__init__.py index d69a5a718..254e1651b 100644 --- a/spacy/lang/fr/__init__.py +++ b/spacy/lang/fr/__init__.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import Optional, Callable from thinc.api import Model @@ -31,13 +31,25 @@ class French(Language): @French.factory( "lemmatizer", assigns=["token.lemma"], - default_config={"model": None, "mode": "rule", "overwrite": False}, + default_config={ + "model": None, + 
"mode": "rule", + "overwrite": False, + "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"}, + }, default_score_weights={"lemma_acc": 1.0}, ) def make_lemmatizer( - nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool + nlp: Language, + model: Optional[Model], + name: str, + mode: str, + overwrite: bool, + scorer: Optional[Callable], ): - return FrenchLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) + return FrenchLemmatizer( + nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer + ) __all__ = ["French"] diff --git a/spacy/lang/it/__init__.py b/spacy/lang/it/__init__.py index 672a8698e..fc74789a3 100644 --- a/spacy/lang/it/__init__.py +++ b/spacy/lang/it/__init__.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import Optional, Callable from thinc.api import Model from .stop_words import STOP_WORDS @@ -23,13 +23,25 @@ class Italian(Language): @Italian.factory( "lemmatizer", assigns=["token.lemma"], - default_config={"model": None, "mode": "pos_lookup", "overwrite": False}, + default_config={ + "model": None, + "mode": "pos_lookup", + "overwrite": False, + "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"}, + }, default_score_weights={"lemma_acc": 1.0}, ) def make_lemmatizer( - nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool + nlp: Language, + model: Optional[Model], + name: str, + mode: str, + overwrite: bool, + scorer: Optional[Callable], ): - return ItalianLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) + return ItalianLemmatizer( + nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer + ) __all__ = ["Italian"] diff --git a/spacy/lang/mk/__init__.py b/spacy/lang/mk/__init__.py index 2f6097f16..a8464f3b7 100644 --- a/spacy/lang/mk/__init__.py +++ b/spacy/lang/mk/__init__.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import Optional, Callable from thinc.api import Model from .lemmatizer import MacedonianLemmatizer from .stop_words import STOP_WORDS @@ -38,13 +38,25 @@ class Macedonian(Language): @Macedonian.factory( "lemmatizer", assigns=["token.lemma"], - default_config={"model": None, "mode": "rule", "overwrite": False}, + default_config={ + "model": None, + "mode": "rule", + "overwrite": False, + "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"}, + }, default_score_weights={"lemma_acc": 1.0}, ) def make_lemmatizer( - nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool + nlp: Language, + model: Optional[Model], + name: str, + mode: str, + overwrite: bool, + scorer: Optional[Callable], ): - return MacedonianLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) + return MacedonianLemmatizer( + nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer + ) __all__ = ["Macedonian"] diff --git a/spacy/lang/nb/__init__.py b/spacy/lang/nb/__init__.py index 0bfde7d28..d08f8f768 100644 --- a/spacy/lang/nb/__init__.py +++ b/spacy/lang/nb/__init__.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import Optional, Callable from thinc.api import Model from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES @@ -26,13 +26,25 @@ class Norwegian(Language): @Norwegian.factory( "lemmatizer", assigns=["token.lemma"], - default_config={"model": None, "mode": "rule", "overwrite": False}, + default_config={ + "model": None, + "mode": "rule", + "overwrite": False, + "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"}, + }, 
default_score_weights={"lemma_acc": 1.0}, ) def make_lemmatizer( - nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool + nlp: Language, + model: Optional[Model], + name: str, + mode: str, + overwrite: bool, + scorer: Optional[Callable], ): - return Lemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) + return Lemmatizer( + nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer + ) __all__ = ["Norwegian"] diff --git a/spacy/lang/nl/__init__.py b/spacy/lang/nl/__init__.py index 5e95b4a8b..0a6480a1d 100644 --- a/spacy/lang/nl/__init__.py +++ b/spacy/lang/nl/__init__.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import Optional, Callable from thinc.api import Model @@ -30,13 +30,25 @@ class Dutch(Language): @Dutch.factory( "lemmatizer", assigns=["token.lemma"], - default_config={"model": None, "mode": "rule", "overwrite": False}, + default_config={ + "model": None, + "mode": "rule", + "overwrite": False, + "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"}, + }, default_score_weights={"lemma_acc": 1.0}, ) def make_lemmatizer( - nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool + nlp: Language, + model: Optional[Model], + name: str, + mode: str, + overwrite: bool, + scorer: Optional[Callable], ): - return DutchLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) + return DutchLemmatizer( + nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer + ) __all__ = ["Dutch"] diff --git a/spacy/lang/pl/__init__.py b/spacy/lang/pl/__init__.py index 585e08c60..1d71244a2 100644 --- a/spacy/lang/pl/__init__.py +++ b/spacy/lang/pl/__init__.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import Optional, Callable from thinc.api import Model @@ -33,13 +33,25 @@ class Polish(Language): @Polish.factory( "lemmatizer", assigns=["token.lemma"], - default_config={"model": None, "mode": "pos_lookup", "overwrite": False}, + default_config={ + "model": None, + "mode": "pos_lookup", + "overwrite": False, + "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"}, + }, default_score_weights={"lemma_acc": 1.0}, ) def make_lemmatizer( - nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool + nlp: Language, + model: Optional[Model], + name: str, + mode: str, + overwrite: bool, + scorer: Optional[Callable], ): - return PolishLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) + return PolishLemmatizer( + nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer + ) __all__ = ["Polish"] diff --git a/spacy/lang/ru/__init__.py b/spacy/lang/ru/__init__.py index 4287cc288..0f645ddb1 100644 --- a/spacy/lang/ru/__init__.py +++ b/spacy/lang/ru/__init__.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import Optional, Callable from thinc.api import Model from .stop_words import STOP_WORDS @@ -22,7 +22,12 @@ class Russian(Language): @Russian.factory( "lemmatizer", assigns=["token.lemma"], - default_config={"model": None, "mode": "pymorphy2", "overwrite": False}, + default_config={ + "model": None, + "mode": "pymorphy2", + "overwrite": False, + "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"}, + }, default_score_weights={"lemma_acc": 1.0}, ) def make_lemmatizer( @@ -31,8 +36,11 @@ def make_lemmatizer( name: str, mode: str, overwrite: bool, + scorer: Optional[Callable], ): - return RussianLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) + return RussianLemmatizer( + nlp.vocab, model, name, mode=mode, overwrite=overwrite, 
scorer=scorer + ) __all__ = ["Russian"] diff --git a/spacy/lang/ru/lemmatizer.py b/spacy/lang/ru/lemmatizer.py index 399cd174c..185e09718 100644 --- a/spacy/lang/ru/lemmatizer.py +++ b/spacy/lang/ru/lemmatizer.py @@ -1,8 +1,9 @@ -from typing import Optional, List, Dict, Tuple +from typing import Optional, List, Dict, Tuple, Callable from thinc.api import Model from ...pipeline import Lemmatizer +from ...pipeline.lemmatizer import lemmatizer_score from ...symbols import POS from ...tokens import Token from ...vocab import Vocab @@ -20,6 +21,7 @@ class RussianLemmatizer(Lemmatizer): *, mode: str = "pymorphy2", overwrite: bool = False, + scorer: Optional[Callable] = lemmatizer_score, ) -> None: if mode == "pymorphy2": try: @@ -31,7 +33,7 @@ class RussianLemmatizer(Lemmatizer): ) from None if getattr(self, "_morph", None) is None: self._morph = MorphAnalyzer() - super().__init__(vocab, model, name, mode=mode, overwrite=overwrite) + super().__init__(vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer) def pymorphy2_lemmatize(self, token: Token) -> List[str]: string = token.text diff --git a/spacy/lang/sv/__init__.py b/spacy/lang/sv/__init__.py index 1b1b69fac..aa8d3f110 100644 --- a/spacy/lang/sv/__init__.py +++ b/spacy/lang/sv/__init__.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import Optional, Callable from thinc.api import Model from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS @@ -29,13 +29,25 @@ class Swedish(Language): @Swedish.factory( "lemmatizer", assigns=["token.lemma"], - default_config={"model": None, "mode": "rule", "overwrite": False}, + default_config={ + "model": None, + "mode": "rule", + "overwrite": False, + "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"}, + }, default_score_weights={"lemma_acc": 1.0}, ) def make_lemmatizer( - nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool + nlp: Language, + model: Optional[Model], + name: str, + mode: str, + overwrite: bool, + scorer: Optional[Callable], ): - return Lemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) + return Lemmatizer( + nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer + ) __all__ = ["Swedish"] diff --git a/spacy/lang/uk/__init__.py b/spacy/lang/uk/__init__.py index 677281ec6..2eef110b2 100644 --- a/spacy/lang/uk/__init__.py +++ b/spacy/lang/uk/__init__.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import Optional, Callable from thinc.api import Model @@ -23,13 +23,25 @@ class Ukrainian(Language): @Ukrainian.factory( "lemmatizer", assigns=["token.lemma"], - default_config={"model": None, "mode": "pymorphy2", "overwrite": False}, + default_config={ + "model": None, + "mode": "pymorphy2", + "overwrite": False, + "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"}, + }, default_score_weights={"lemma_acc": 1.0}, ) def make_lemmatizer( - nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool + nlp: Language, + model: Optional[Model], + name: str, + mode: str, + overwrite: bool, + scorer: Optional[Callable], ): - return UkrainianLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) + return UkrainianLemmatizer( + nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer + ) __all__ = ["Ukrainian"] diff --git a/spacy/lang/uk/lemmatizer.py b/spacy/lang/uk/lemmatizer.py index 1fb030e06..fd566a3a8 100644 --- a/spacy/lang/uk/lemmatizer.py +++ b/spacy/lang/uk/lemmatizer.py @@ -1,8 +1,9 @@ -from typing import Optional +from typing 
import Optional, Callable from thinc.api import Model from ..ru.lemmatizer import RussianLemmatizer +from ...pipeline.lemmatizer import lemmatizer_score from ...vocab import Vocab @@ -15,6 +16,7 @@ class UkrainianLemmatizer(RussianLemmatizer): *, mode: str = "pymorphy2", overwrite: bool = False, + scorer: Optional[Callable] = lemmatizer_score, ) -> None: if mode == "pymorphy2": try: @@ -27,4 +29,4 @@ class UkrainianLemmatizer(RussianLemmatizer): ) from None if getattr(self, "_morph", None) is None: self._morph = MorphAnalyzer(lang="uk") - super().__init__(vocab, model, name, mode=mode, overwrite=overwrite) + super().__init__(vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer) diff --git a/spacy/pipeline/attributeruler.py b/spacy/pipeline/attributeruler.py index f95a5a48c..733a65199 100644 --- a/spacy/pipeline/attributeruler.py +++ b/spacy/pipeline/attributeruler.py @@ -5,15 +5,15 @@ from pathlib import Path from .pipe import Pipe from ..errors import Errors -from ..training import validate_examples, Example +from ..training import Example from ..language import Language from ..matcher import Matcher from ..scorer import Scorer -from ..symbols import IDS, TAG, POS, MORPH, LEMMA +from ..symbols import IDS from ..tokens import Doc, Span from ..tokens._retokenize import normalize_token_attrs, set_token_attrs from ..vocab import Vocab -from ..util import SimpleFrozenList +from ..util import SimpleFrozenList, registry from .. import util @@ -23,9 +23,43 @@ TagMapType = Dict[str, Dict[Union[int, str], Union[int, str]]] MorphRulesType = Dict[str, Dict[str, Dict[Union[int, str], Union[int, str]]]] -@Language.factory("attribute_ruler", default_config={"validate": False}) -def make_attribute_ruler(nlp: Language, name: str, validate: bool): - return AttributeRuler(nlp.vocab, name, validate=validate) +@Language.factory( + "attribute_ruler", + default_config={ + "validate": False, + "scorer": {"@scorers": "spacy.attribute_ruler_scorer.v1"}, + }, +) +def make_attribute_ruler( + nlp: Language, name: str, validate: bool, scorer: Optional[Callable] +): + return AttributeRuler(nlp.vocab, name, validate=validate, scorer=scorer) + + +def attribute_ruler_score( + examples: Iterable[Example], **kwargs +) -> Dict[str, Any]: + def morph_key_getter(token, attr): + return getattr(token, attr).key + + results = {} + results.update(Scorer.score_token_attr(examples, "tag", **kwargs)) + results.update(Scorer.score_token_attr(examples, "pos", **kwargs)) + results.update( + Scorer.score_token_attr(examples, "morph", getter=morph_key_getter, **kwargs) + ) + results.update( + Scorer.score_token_attr_per_feat( + examples, "morph", getter=morph_key_getter, **kwargs + ) + ) + results.update(Scorer.score_token_attr(examples, "lemma", **kwargs)) + return results + + +@registry.scorers("spacy.attribute_ruler_scorer.v1") +def make_attribute_ruler_scorer(): + return attribute_ruler_score class AttributeRuler(Pipe): @@ -36,7 +70,12 @@ class AttributeRuler(Pipe): """ def __init__( - self, vocab: Vocab, name: str = "attribute_ruler", *, validate: bool = False + self, + vocab: Vocab, + name: str = "attribute_ruler", + *, + validate: bool = False, + scorer: Optional[Callable] = attribute_ruler_score, ) -> None: """Create the AttributeRuler. 
After creation, you can add patterns with the `.initialize()` or `.add_patterns()` methods, or load patterns @@ -57,6 +96,7 @@ class AttributeRuler(Pipe): self.attrs = [] self._attrs_unnormed = [] # store for reference self.indices = [] + self.scorer = scorer def clear(self) -> None: """Reset all patterns.""" @@ -228,45 +268,6 @@ class AttributeRuler(Pipe): all_patterns.append(p) return all_patterns - def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]: - """Score a batch of examples. - - examples (Iterable[Example]): The examples to score. - RETURNS (Dict[str, Any]): The scores, produced by - Scorer.score_token_attr for the attributes "tag", "pos", "morph" - and "lemma" for the target token attributes. - - DOCS: https://spacy.io/api/tagger#score - """ - - def morph_key_getter(token, attr): - return getattr(token, attr).key - - validate_examples(examples, "AttributeRuler.score") - results = {} - attrs = set() - for token_attrs in self.attrs: - attrs.update(token_attrs) - for attr in attrs: - if attr == TAG: - results.update(Scorer.score_token_attr(examples, "tag", **kwargs)) - elif attr == POS: - results.update(Scorer.score_token_attr(examples, "pos", **kwargs)) - elif attr == MORPH: - results.update( - Scorer.score_token_attr( - examples, "morph", getter=morph_key_getter, **kwargs - ) - ) - results.update( - Scorer.score_token_attr_per_feat( - examples, "morph", getter=morph_key_getter, **kwargs - ) - ) - elif attr == LEMMA: - results.update(Scorer.score_token_attr(examples, "lemma", **kwargs)) - return results - def to_bytes(self, exclude: Iterable[str] = SimpleFrozenList()) -> bytes: """Serialize the AttributeRuler to a bytestring. diff --git a/spacy/pipeline/dep_parser.pyx b/spacy/pipeline/dep_parser.pyx index be23ab0dd..59364326b 100644 --- a/spacy/pipeline/dep_parser.pyx +++ b/spacy/pipeline/dep_parser.pyx @@ -1,6 +1,6 @@ # cython: infer_types=True, profile=True, binding=True from collections import defaultdict -from typing import Optional, Iterable +from typing import Optional, Iterable, Callable from thinc.api import Model, Config from ._parser_internals.transition_system import TransitionSystem @@ -12,7 +12,7 @@ from ..language import Language from ._parser_internals import nonproj from ._parser_internals.nonproj import DELIMITER from ..scorer import Scorer -from ..training import validate_examples +from ..util import registry default_model_config = """ @@ -46,6 +46,7 @@ DEFAULT_PARSER_MODEL = Config().from_str(default_model_config)["model"] "learn_tokens": False, "min_action_freq": 30, "model": DEFAULT_PARSER_MODEL, + "scorer": {"@scorers": "spacy.parser_scorer.v1"}, }, default_score_weights={ "dep_uas": 0.5, @@ -63,7 +64,8 @@ def make_parser( moves: Optional[TransitionSystem], update_with_oracle_cut_size: int, learn_tokens: bool, - min_action_freq: int + min_action_freq: int, + scorer: Optional[Callable], ): """Create a transition-based DependencyParser component. The dependency parser jointly learns sentence segmentation and labelled dependency parsing, and can @@ -115,7 +117,8 @@ def make_parser( beam_update_prob=0.0, # At some point in the future we can try to implement support for # partial annotations, perhaps only in the beam objective. 
- incorrect_spans_key=None + incorrect_spans_key=None, + scorer=scorer, ) @Language.factory( @@ -130,6 +133,7 @@ def make_parser( "learn_tokens": False, "min_action_freq": 30, "model": DEFAULT_PARSER_MODEL, + "scorer": {"@scorers": "spacy.parser_scorer.v1"}, }, default_score_weights={ "dep_uas": 0.5, @@ -151,6 +155,7 @@ def make_beam_parser( beam_width: int, beam_density: float, beam_update_prob: float, + scorer: Optional[Callable], ): """Create a transition-based DependencyParser component that uses beam-search. The dependency parser jointly learns sentence segmentation and labelled @@ -207,10 +212,41 @@ def make_beam_parser( min_action_freq=min_action_freq, # At some point in the future we can try to implement support for # partial annotations, perhaps only in the beam objective. - incorrect_spans_key=None + incorrect_spans_key=None, + scorer=scorer, ) +def parser_score(examples, **kwargs): + """Score a batch of examples. + + examples (Iterable[Example]): The examples to score. + RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans + and Scorer.score_deps. + + DOCS: https://spacy.io/api/dependencyparser#score + """ + def has_sents(doc): + return doc.has_annotation("SENT_START") + + def dep_getter(token, attr): + dep = getattr(token, attr) + dep = token.vocab.strings.as_string(dep).lower() + return dep + results = {} + results.update(Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs)) + kwargs.setdefault("getter", dep_getter) + kwargs.setdefault("ignore_labels", ("p", "punct")) + results.update(Scorer.score_deps(examples, "dep", **kwargs)) + del results["sents_per_type"] + return results + + +@registry.scorers("spacy.parser_scorer.v1") +def make_parser_scorer(): + return parser_score + + cdef class DependencyParser(Parser): """Pipeline component for dependency parsing. @@ -233,6 +269,7 @@ cdef class DependencyParser(Parser): beam_update_prob=0.0, multitasks=tuple(), incorrect_spans_key=None, + scorer=parser_score, ): """Create a DependencyParser. """ @@ -249,6 +286,7 @@ cdef class DependencyParser(Parser): beam_update_prob=beam_update_prob, multitasks=multitasks, incorrect_spans_key=incorrect_spans_key, + scorer=scorer, ) @property @@ -281,31 +319,6 @@ cdef class DependencyParser(Parser): labels.add(label) return tuple(sorted(labels)) - def score(self, examples, **kwargs): - """Score a batch of examples. - - examples (Iterable[Example]): The examples to score. - RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans - and Scorer.score_deps. - - DOCS: https://spacy.io/api/dependencyparser#score - """ - def has_sents(doc): - return doc.has_annotation("SENT_START") - - validate_examples(examples, "DependencyParser.score") - def dep_getter(token, attr): - dep = getattr(token, attr) - dep = token.vocab.strings.as_string(dep).lower() - return dep - results = {} - results.update(Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs)) - kwargs.setdefault("getter", dep_getter) - kwargs.setdefault("ignore_labels", ("p", "punct")) - results.update(Scorer.score_deps(examples, "dep", **kwargs)) - del results["sents_per_type"] - return results - def scored_parses(self, beams): """Return two dictionaries with scores for each beam/doc that was processed: one containing (i, head) keys, and another containing (i, label) keys. 
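
The dep_parser changes above show the full shape of the new pattern: a module-level scoring function, a factory registered in the new `scorers` registry, and a `scorer` entry in the component's default config. Below is a minimal sketch of how downstream code could plug in its own scorer; it is not part of this patch, and the registry name `custom_dep_scorer.v1` and the reduced metric set are illustrative assumptions.

```python
# Hedged sketch: override the parser's default scorer with a custom one.
# Everything named "custom_*" here is an assumption for illustration only.
from typing import Any, Dict, Iterable

import spacy
from spacy.scorer import Scorer
from spacy.training import Example
from spacy.util import registry


def custom_dep_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
    # Mirror parser_score above, but score labelled dependencies only and
    # skip the sentence-boundary metrics.
    def dep_getter(token, attr):
        dep = getattr(token, attr)
        return token.vocab.strings.as_string(dep).lower()

    kwargs.setdefault("getter", dep_getter)
    kwargs.setdefault("ignore_labels", ("p", "punct"))
    return Scorer.score_deps(examples, "dep", **kwargs)


@registry.scorers("custom_dep_scorer.v1")
def make_custom_dep_scorer():
    # Registered entries are factories that return the scoring callable,
    # exactly like spacy.parser_scorer.v1 defined in dep_parser.pyx above.
    return custom_dep_score


nlp = spacy.blank("en")
# The factory's new `scorer` parameter picks this up via the config override.
nlp.add_pipe("parser", config={"scorer": {"@scorers": "custom_dep_scorer.v1"}})
```

Because the scoring functions no longer receive the component instance, anything they need (labels, thresholds, getters) has to come in through `**kwargs` or be baked into the registered function, which is the trade-off described in the commit message.
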
diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index 7b52025bc..919d1fe6d 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -16,7 +16,7 @@ from ..language import Language from ..vocab import Vocab from ..training import Example, validate_examples, validate_get_examples from ..errors import Errors, Warnings -from ..util import SimpleFrozenList +from ..util import SimpleFrozenList, registry from .. import util from ..scorer import Scorer @@ -50,6 +50,7 @@ DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"] "incl_context": True, "entity_vector_length": 64, "get_candidates": {"@misc": "spacy.CandidateGenerator.v1"}, + "scorer": {"@scorers": "spacy.entity_linker_scorer.v1"}, }, default_score_weights={ "nel_micro_f": 1.0, @@ -68,6 +69,7 @@ def make_entity_linker( incl_context: bool, entity_vector_length: int, get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]], + scorer: Optional[Callable], ): """Construct an EntityLinker component. @@ -92,9 +94,19 @@ def make_entity_linker( incl_context=incl_context, entity_vector_length=entity_vector_length, get_candidates=get_candidates, + scorer=scorer, ) +def entity_linker_score(examples, **kwargs): + return Scorer.score_links(examples, negative_labels=[EntityLinker.NIL], **kwargs) + + +@registry.scorers("spacy.entity_linker_scorer.v1") +def make_entity_linker_scorer(): + return entity_linker_score + + class EntityLinker(TrainablePipe): """Pipeline component for named entity linking. @@ -115,6 +127,7 @@ class EntityLinker(TrainablePipe): incl_context: bool, entity_vector_length: int, get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]], + scorer: Optional[Callable] = entity_linker_score, ) -> None: """Initialize an entity linker. @@ -145,6 +158,7 @@ class EntityLinker(TrainablePipe): # how many neighbour sentences to take into account # create an empty KB by default. If you want to load a predefined one, specify it in 'initialize'. self.kb = empty_kb(entity_vector_length)(self.vocab) + self.scorer = scorer def set_kb(self, kb_loader: Callable[[Vocab], KnowledgeBase]): """Define the KB of this pipe by providing a function that will @@ -389,17 +403,6 @@ class EntityLinker(TrainablePipe): for token in ent: token.ent_kb_id_ = kb_id - def score(self, examples, **kwargs): - """Score a batch of examples. - - examples (Iterable[Example]): The examples to score. - RETURNS (Dict[str, Any]): The scores. - - DOCS TODO: https://spacy.io/api/entity_linker#score - """ - validate_examples(examples, "EntityLinker.score") - return Scorer.score_links(examples, negative_labels=[self.NIL]) - def to_bytes(self, *, exclude=tuple()): """Serialize the pipe to a bytestring. 
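The `EntityLinker` change follows the same pattern: its old `score` method becomes the module-level `entity_linker_score`, which fixes `negative_labels` to `[EntityLinker.NIL]` and is registered as `spacy.entity_linker_scorer.v1` for the factory to resolve. A minimal sketch of looking up that registered default by name and calling it directly — for instance in a quick test — could look like the following; the empty example list is purely illustrative and would normally be replaced by `Example` objects built from annotated data:

```python
from spacy import registry  # importing spacy registers the built-in scorers

# Resolve the registered default exactly as the factory does when it sees
# {"@scorers": "spacy.entity_linker_scorer.v1"} in the component config.
make_nel_scorer = registry.scorers.get("spacy.entity_linker_scorer.v1")
nel_scorer = make_nel_scorer()

# `examples` would normally be a list of spacy.training.Example objects;
# an empty list is used here only to show the call shape.
examples = []
scores = nel_scorer(examples)
print(scores["nel_micro_f"])  # micro PRF keys, as weighted in default_score_weights
```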
diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index 1dea8fba0..d2a0c5045 100644 --- a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -8,11 +8,10 @@ from .pipe import Pipe from ..training import Example from ..language import Language from ..errors import Errors, Warnings -from ..util import ensure_path, to_disk, from_disk, SimpleFrozenList +from ..util import ensure_path, to_disk, from_disk, SimpleFrozenList, registry from ..tokens import Doc, Span from ..matcher import Matcher, PhraseMatcher from ..scorer import get_ner_prf -from ..training import validate_examples DEFAULT_ENT_ID_SEP = "||" @@ -27,6 +26,7 @@ PatternType = Dict[str, Union[str, List[Dict[str, Any]]]] "validate": False, "overwrite_ents": False, "ent_id_sep": DEFAULT_ENT_ID_SEP, + "scorer": {"@scorers": "spacy.entity_ruler_scorer.v1"}, }, default_score_weights={ "ents_f": 1.0, @@ -42,6 +42,7 @@ def make_entity_ruler( validate: bool, overwrite_ents: bool, ent_id_sep: str, + scorer: Optional[Callable], ): return EntityRuler( nlp, @@ -50,9 +51,19 @@ def make_entity_ruler( validate=validate, overwrite_ents=overwrite_ents, ent_id_sep=ent_id_sep, + scorer=scorer, ) +def entity_ruler_score(examples, **kwargs): + return get_ner_prf(examples) + + +@registry.scorers("spacy.entity_ruler_scorer.v1") +def make_entity_ruler_scorer(): + return entity_ruler_score + + class EntityRuler(Pipe): """The EntityRuler lets you add spans to the `Doc.ents` using token-based rules or exact phrase matches. It can be combined with the statistical @@ -74,6 +85,7 @@ class EntityRuler(Pipe): overwrite_ents: bool = False, ent_id_sep: str = DEFAULT_ENT_ID_SEP, patterns: Optional[List[PatternType]] = None, + scorer: Optional[Callable] = entity_ruler_score, ) -> None: """Initialize the entity ruler. If patterns are supplied here, they need to be a list of dictionaries with a `"label"` and `"pattern"` @@ -112,6 +124,7 @@ class EntityRuler(Pipe): self._ent_ids = defaultdict(dict) if patterns is not None: self.add_patterns(patterns) + self.scorer = scorer def __len__(self) -> int: """The number of all patterns added to the entity ruler.""" @@ -358,10 +371,6 @@ class EntityRuler(Pipe): label = f"{label}{self.ent_id_sep}{ent_id}" return label - def score(self, examples, **kwargs): - validate_examples(examples, "EntityRuler.score") - return get_ner_prf(examples) - def from_bytes( self, patterns_bytes: bytes, *, exclude: Iterable[str] = SimpleFrozenList() ) -> "EntityRuler": diff --git a/spacy/pipeline/lemmatizer.py b/spacy/pipeline/lemmatizer.py index 2f436c57a..1bf513661 100644 --- a/spacy/pipeline/lemmatizer.py +++ b/spacy/pipeline/lemmatizer.py @@ -12,21 +12,41 @@ from ..lookups import Lookups, load_lookups from ..scorer import Scorer from ..tokens import Doc, Token from ..vocab import Vocab -from ..training import validate_examples -from ..util import logger, SimpleFrozenList +from ..util import logger, SimpleFrozenList, registry from .. 
import util @Language.factory( "lemmatizer", assigns=["token.lemma"], - default_config={"model": None, "mode": "lookup", "overwrite": False}, + default_config={ + "model": None, + "mode": "lookup", + "overwrite": False, + "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"}, + }, default_score_weights={"lemma_acc": 1.0}, ) def make_lemmatizer( - nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool = False + nlp: Language, + model: Optional[Model], + name: str, + mode: str, + overwrite: bool, + scorer: Optional[Callable], ): - return Lemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) + return Lemmatizer( + nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer + ) + + +def lemmatizer_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]: + return Scorer.score_token_attr(examples, "lemma", **kwargs) + + +@registry.scorers("spacy.lemmatizer_scorer.v1") +def make_lemmatizer_scorer(): + return lemmatizer_score class Lemmatizer(Pipe): @@ -60,6 +80,7 @@ class Lemmatizer(Pipe): *, mode: str = "lookup", overwrite: bool = False, + scorer: Optional[Callable] = lemmatizer_score, ) -> None: """Initialize a Lemmatizer. @@ -89,6 +110,7 @@ class Lemmatizer(Pipe): raise ValueError(Errors.E1003.format(mode=mode)) self.lemmatize = getattr(self, mode_attr) self.cache = {} + self.scorer = scorer @property def mode(self): @@ -247,17 +269,6 @@ class Lemmatizer(Pipe): """ return False - def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]: - """Score a batch of examples. - - examples (Iterable[Example]): The examples to score. - RETURNS (Dict[str, Any]): The scores. - - DOCS: https://spacy.io/api/lemmatizer#score - """ - validate_examples(examples, "Lemmatizer.score") - return Scorer.score_token_attr(examples, "lemma", **kwargs) - def to_disk( self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList() ): diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index 3ba05e616..c5293e860 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -1,5 +1,5 @@ # cython: infer_types=True, profile=True, binding=True -from typing import Optional, Union, Dict +from typing import Optional, Union, Dict, Callable import srsly from thinc.api import SequenceCategoricalCrossentropy, Model, Config from itertools import islice @@ -17,6 +17,7 @@ from .tagger import Tagger from .. 
import util from ..scorer import Scorer from ..training import validate_examples, validate_get_examples +from ..util import registry default_model_config = """ @@ -48,15 +49,33 @@ DEFAULT_MORPH_MODEL = Config().from_str(default_model_config)["model"] @Language.factory( "morphologizer", assigns=["token.morph", "token.pos"], - default_config={"model": DEFAULT_MORPH_MODEL}, + default_config={"model": DEFAULT_MORPH_MODEL, "scorer": {"@scorers": "spacy.morphologizer_scorer.v1"}}, default_score_weights={"pos_acc": 0.5, "morph_acc": 0.5, "morph_per_feat": None}, ) def make_morphologizer( nlp: Language, model: Model, name: str, + scorer: Optional[Callable], ): - return Morphologizer(nlp.vocab, model, name) + return Morphologizer(nlp.vocab, model, name, scorer=scorer) + + +def morphologizer_score(examples, **kwargs): + def morph_key_getter(token, attr): + return getattr(token, attr).key + + results = {} + results.update(Scorer.score_token_attr(examples, "pos", **kwargs)) + results.update(Scorer.score_token_attr(examples, "morph", getter=morph_key_getter, **kwargs)) + results.update(Scorer.score_token_attr_per_feat(examples, + "morph", getter=morph_key_getter, **kwargs)) + return results + + +@registry.scorers("spacy.morphologizer_scorer.v1") +def make_morphologizer_scorer(): + return morphologizer_score class Morphologizer(Tagger): @@ -67,6 +86,8 @@ class Morphologizer(Tagger): vocab: Vocab, model: Model, name: str = "morphologizer", + *, + scorer: Optional[Callable] = morphologizer_score, ): """Initialize a morphologizer. @@ -87,6 +108,7 @@ class Morphologizer(Tagger): # 2) labels_pos stores a mapping from morph+POS->POS cfg = {"labels_morph": {}, "labels_pos": {}} self.cfg = dict(sorted(cfg.items())) + self.scorer = scorer @property def labels(self): @@ -246,24 +268,3 @@ class Morphologizer(Tagger): if self.model.ops.xp.isnan(loss): raise ValueError(Errors.E910.format(name=self.name)) return float(loss), d_scores - - def score(self, examples, **kwargs): - """Score a batch of examples. - - examples (Iterable[Example]): The examples to score. - RETURNS (Dict[str, Any]): The scores, produced by - Scorer.score_token_attr for the attributes "pos" and "morph" and - Scorer.score_token_attr_per_feat for the attribute "morph". 
- - DOCS: https://spacy.io/api/morphologizer#score - """ - def morph_key_getter(token, attr): - return getattr(token, attr).key - - validate_examples(examples, "Morphologizer.score") - results = {} - results.update(Scorer.score_token_attr(examples, "pos", **kwargs)) - results.update(Scorer.score_token_attr(examples, "morph", getter=morph_key_getter, **kwargs)) - results.update(Scorer.score_token_attr_per_feat(examples, - "morph", getter=morph_key_getter, **kwargs)) - return results diff --git a/spacy/pipeline/ner.pyx b/spacy/pipeline/ner.pyx index f4ae4b787..857e3c088 100644 --- a/spacy/pipeline/ner.pyx +++ b/spacy/pipeline/ner.pyx @@ -1,6 +1,6 @@ # cython: infer_types=True, profile=True, binding=True from collections import defaultdict -from typing import Optional, Iterable +from typing import Optional, Iterable, Callable from thinc.api import Model, Config from ._parser_internals.transition_system import TransitionSystem @@ -9,7 +9,7 @@ from ._parser_internals.ner cimport BiluoPushDown from ..language import Language from ..scorer import get_ner_prf, PRFScore -from ..training import validate_examples +from ..util import registry default_model_config = """ @@ -41,7 +41,8 @@ DEFAULT_NER_MODEL = Config().from_str(default_model_config)["model"] "moves": None, "update_with_oracle_cut_size": 100, "model": DEFAULT_NER_MODEL, - "incorrect_spans_key": None + "incorrect_spans_key": None, + "scorer": {"@scorers": "spacy.ner_scorer.v1"}, }, default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None}, @@ -52,7 +53,8 @@ def make_ner( model: Model, moves: Optional[TransitionSystem], update_with_oracle_cut_size: int, - incorrect_spans_key: Optional[str]=None + incorrect_spans_key: Optional[str], + scorer: Optional[Callable], ): """Create a transition-based EntityRecognizer component. The entity recognizer identifies non-overlapping labelled spans of tokens. @@ -92,6 +94,7 @@ def make_ner( beam_width=1, beam_density=0.0, beam_update_prob=0.0, + scorer=scorer, ) @Language.factory( @@ -104,7 +107,8 @@ def make_ner( "beam_density": 0.01, "beam_update_prob": 0.5, "beam_width": 32, - "incorrect_spans_key": None + "incorrect_spans_key": None, + "scorer": None, }, default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None}, ) @@ -117,7 +121,8 @@ def make_beam_ner( beam_width: int, beam_density: float, beam_update_prob: float, - incorrect_spans_key: Optional[str]=None + incorrect_spans_key: Optional[str], + scorer: Optional[Callable], ): """Create a transition-based EntityRecognizer component that uses beam-search. The entity recognizer identifies non-overlapping labelled spans of tokens. @@ -164,10 +169,20 @@ def make_beam_ner( beam_width=beam_width, beam_density=beam_density, beam_update_prob=beam_update_prob, - incorrect_spans_key=incorrect_spans_key + incorrect_spans_key=incorrect_spans_key, + scorer=scorer, ) +def ner_score(examples, **kwargs): + return get_ner_prf(examples, **kwargs) + + +@registry.scorers("spacy.ner_scorer.v1") +def make_ner_scorer(): + return ner_score + + cdef class EntityRecognizer(Parser): """Pipeline component for named entity recognition. @@ -188,6 +203,7 @@ cdef class EntityRecognizer(Parser): beam_update_prob=0.0, multitasks=tuple(), incorrect_spans_key=None, + scorer=ner_score, ): """Create an EntityRecognizer. 
""" @@ -204,6 +220,7 @@ cdef class EntityRecognizer(Parser): beam_update_prob=beam_update_prob, multitasks=multitasks, incorrect_spans_key=incorrect_spans_key, + scorer=scorer, ) def add_multitask_objective(self, mt_component): @@ -227,17 +244,6 @@ cdef class EntityRecognizer(Parser): if move[0] in ("B", "I", "L", "U")) return tuple(sorted(labels)) - def score(self, examples, **kwargs): - """Score a batch of examples. - - examples (Iterable[Example]): The examples to score. - RETURNS (Dict[str, Any]): The NER precision, recall and f-scores. - - DOCS: https://spacy.io/api/entityrecognizer#score - """ - validate_examples(examples, "EntityRecognizer.score") - return get_ner_prf(examples) - def scored_ents(self, beams): """Return a dictionary of (start, end, label) tuples with corresponding scores for each beam/doc that was processed. diff --git a/spacy/pipeline/pipe.pyx b/spacy/pipeline/pipe.pyx index 0d298ce4f..14f9f08f8 100644 --- a/spacy/pipeline/pipe.pyx +++ b/spacy/pipeline/pipe.pyx @@ -81,6 +81,17 @@ cdef class Pipe: DOCS: https://spacy.io/api/pipe#score """ + if hasattr(self, "scorer") and self.scorer is not None: + scorer_kwargs = {} + # use default settings from cfg (e.g., threshold) + if hasattr(self, "cfg") and isinstance(self.cfg, dict): + scorer_kwargs.update(self.cfg) + # override self.cfg["labels"] with self.labels + if hasattr(self, "labels"): + scorer_kwargs["labels"] = self.labels + # override with kwargs settings + scorer_kwargs.update(kwargs) + return self.scorer(examples, **scorer_kwargs) return {} @property diff --git a/spacy/pipeline/sentencizer.pyx b/spacy/pipeline/sentencizer.pyx index 60102efcb..c535c7e48 100644 --- a/spacy/pipeline/sentencizer.pyx +++ b/spacy/pipeline/sentencizer.pyx @@ -1,26 +1,29 @@ # cython: infer_types=True, profile=True, binding=True -from typing import Optional, List +from typing import Optional, List, Callable import srsly from ..tokens.doc cimport Doc + from .pipe import Pipe +from .senter import senter_score from ..language import Language from ..scorer import Scorer -from ..training import validate_examples from .. import util + @Language.factory( "sentencizer", assigns=["token.is_sent_start", "doc.sents"], - default_config={"punct_chars": None}, + default_config={"punct_chars": None, "scorer": {"@scorers": "spacy.senter_scorer.v1"}}, default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0}, ) def make_sentencizer( nlp: Language, name: str, - punct_chars: Optional[List[str]] + punct_chars: Optional[List[str]], + scorer: Optional[Callable], ): - return Sentencizer(name, punct_chars=punct_chars) + return Sentencizer(name, punct_chars=punct_chars, scorer=scorer) class Sentencizer(Pipe): @@ -41,7 +44,13 @@ class Sentencizer(Pipe): '𑩃', '𑪛', '𑪜', '𑱁', '𑱂', '𖩮', '𖩯', '𖫵', '𖬷', '𖬸', '𖭄', '𛲟', '𝪈', '。', '。'] - def __init__(self, name="sentencizer", *, punct_chars=None): + def __init__( + self, + name="sentencizer", + *, + punct_chars=None, + scorer=senter_score, + ): """Initialize the sentencizer. punct_chars (list): Punctuation characters to split on. Will be @@ -55,6 +64,7 @@ class Sentencizer(Pipe): self.punct_chars = set(punct_chars) else: self.punct_chars = set(self.default_punct_chars) + self.scorer = scorer def __call__(self, doc): """Apply the sentencizer to a Doc and set Token.is_sent_start. @@ -122,22 +132,6 @@ class Sentencizer(Pipe): else: doc.c[j].sent_start = -1 - def score(self, examples, **kwargs): - """Score a batch of examples. - - examples (Iterable[Example]): The examples to score. 
- RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans. - - DOCS: https://spacy.io/api/sentencizer#score - """ - def has_sents(doc): - return doc.has_annotation("SENT_START") - - validate_examples(examples, "Sentencizer.score") - results = Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs) - del results["sents_per_type"] - return results - def to_bytes(self, *, exclude=tuple()): """Serialize the sentencizer to a bytestring. diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx index f9472abf5..3eeb9b5da 100644 --- a/spacy/pipeline/senter.pyx +++ b/spacy/pipeline/senter.pyx @@ -1,5 +1,6 @@ # cython: infer_types=True, profile=True, binding=True from itertools import islice +from typing import Optional, Callable import srsly from thinc.api import Model, SequenceCategoricalCrossentropy, Config @@ -11,6 +12,7 @@ from ..language import Language from ..errors import Errors from ..scorer import Scorer from ..training import validate_examples, validate_get_examples +from ..util import registry from .. import util @@ -34,11 +36,25 @@ DEFAULT_SENTER_MODEL = Config().from_str(default_model_config)["model"] @Language.factory( "senter", assigns=["token.is_sent_start"], - default_config={"model": DEFAULT_SENTER_MODEL}, + default_config={"model": DEFAULT_SENTER_MODEL, "scorer": {"@scorers": "spacy.senter_scorer.v1"}}, default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0}, ) -def make_senter(nlp: Language, name: str, model: Model): - return SentenceRecognizer(nlp.vocab, model, name) +def make_senter(nlp: Language, name: str, model: Model, scorer: Optional[Callable]): + return SentenceRecognizer(nlp.vocab, model, name, scorer=scorer) + + +def senter_score(examples, **kwargs): + def has_sents(doc): + return doc.has_annotation("SENT_START") + + results = Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs) + del results["sents_per_type"] + return results + + +@registry.scorers("spacy.senter_scorer.v1") +def make_senter_scorer(): + return senter_score class SentenceRecognizer(Tagger): @@ -46,7 +62,7 @@ class SentenceRecognizer(Tagger): DOCS: https://spacy.io/api/sentencerecognizer """ - def __init__(self, vocab, model, name="senter"): + def __init__(self, vocab, model, name="senter", *, scorer=senter_score): """Initialize a sentence recognizer. vocab (Vocab): The shared vocabulary. @@ -61,6 +77,7 @@ class SentenceRecognizer(Tagger): self.name = name self._rehearsal_model = None self.cfg = {} + self.scorer = scorer @property def labels(self): @@ -153,18 +170,3 @@ class SentenceRecognizer(Tagger): def add_label(self, label, values=None): raise NotImplementedError - - def score(self, examples, **kwargs): - """Score a batch of examples. - - examples (Iterable[Example]): The examples to score. - RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans. 
- DOCS: https://spacy.io/api/sentencerecognizer#score - """ - def has_sents(doc): - return doc.has_annotation("SENT_START") - - validate_examples(examples, "SentenceRecognizer.score") - results = Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs) - del results["sents_per_type"] - return results diff --git a/spacy/pipeline/spancat.py b/spacy/pipeline/spancat.py index 8d1be06c3..a143ac4eb 100644 --- a/spacy/pipeline/spancat.py +++ b/spacy/pipeline/spancat.py @@ -98,6 +98,7 @@ def build_ngram_range_suggester( "max_positive": None, "model": DEFAULT_SPANCAT_MODEL, "suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]}, + "scorer": {"@scorers": "spacy.spancat_scorer.v1"}, }, default_score_weights={"spans_sc_f": 1.0, "spans_sc_p": 0.0, "spans_sc_r": 0.0}, ) @@ -107,8 +108,9 @@ def make_spancat( suggester: Callable[[List[Doc]], Ragged], model: Model[Tuple[List[Doc], Ragged], Floats2d], spans_key: str, - threshold: float = 0.5, - max_positive: Optional[int] = None, + scorer: Optional[Callable], + threshold: float, + max_positive: Optional[int], ) -> "SpanCategorizer": """Create a SpanCategorizer component. The span categorizer consists of two parts: a suggester function that proposes candidate spans, and a labeller @@ -138,9 +140,28 @@ def make_spancat( threshold=threshold, max_positive=max_positive, name=name, + scorer=scorer, ) +def spancat_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]: + kwargs = dict(kwargs) + attr_prefix = "spans_" + key = kwargs["spans_key"] + kwargs.setdefault("attr", f"{attr_prefix}{key}") + kwargs.setdefault("allow_overlap", True) + kwargs.setdefault( + "getter", lambda doc, key: doc.spans.get(key[len(attr_prefix) :], []) + ) + kwargs.setdefault("has_annotation", lambda doc: key in doc.spans) + return Scorer.score_spans(examples, **kwargs) + + +@registry.scorers("spacy.spancat_scorer.v1") +def make_spancat_scorer(): + return spancat_score + + class SpanCategorizer(TrainablePipe): """Pipeline component to label spans of text. @@ -157,6 +178,7 @@ class SpanCategorizer(TrainablePipe): spans_key: str = "spans", threshold: float = 0.5, max_positive: Optional[int] = None, + scorer: Optional[Callable] = spancat_score, ) -> None: """Initialize the span categorizer. @@ -172,6 +194,7 @@ class SpanCategorizer(TrainablePipe): self.suggester = suggester self.model = model self.name = name + self.scorer = scorer @property def key(self) -> str: @@ -373,28 +396,6 @@ class SpanCategorizer(TrainablePipe): else: self.model.initialize() - def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]: - """Score a batch of examples. - - examples (Iterable[Example]): The examples to score. - RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_cats. 
- - DOCS: https://spacy.io/api/spancategorizer#score - """ - validate_examples(examples, "SpanCategorizer.score") - self._validate_categories(examples) - kwargs = dict(kwargs) - attr_prefix = "spans_" - kwargs.setdefault("attr", f"{attr_prefix}{self.key}") - kwargs.setdefault("labels", self.labels) - kwargs.setdefault("multi_label", True) - kwargs.setdefault("threshold", self.cfg["threshold"]) - kwargs.setdefault( - "getter", lambda doc, key: doc.spans.get(key[len(attr_prefix) :], []) - ) - kwargs.setdefault("has_annotation", lambda doc: self.key in doc.spans) - return Scorer.score_spans(examples, **kwargs) - def _validate_categories(self, examples): # TODO pass diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx index fa260bdd6..327a18f25 100644 --- a/spacy/pipeline/tagger.pyx +++ b/spacy/pipeline/tagger.pyx @@ -1,4 +1,5 @@ # cython: infer_types=True, profile=True, binding=True +from typing import Callable, Optional import numpy import srsly from thinc.api import Model, set_dropout_rate, SequenceCategoricalCrossentropy, Config @@ -18,6 +19,7 @@ from ..parts_of_speech import X from ..errors import Errors, Warnings from ..scorer import Scorer from ..training import validate_examples, validate_get_examples +from ..util import registry from .. import util @@ -41,10 +43,10 @@ DEFAULT_TAGGER_MODEL = Config().from_str(default_model_config)["model"] @Language.factory( "tagger", assigns=["token.tag"], - default_config={"model": DEFAULT_TAGGER_MODEL}, + default_config={"model": DEFAULT_TAGGER_MODEL, "scorer": {"@scorers": "spacy.tagger_scorer.v1"}}, default_score_weights={"tag_acc": 1.0}, ) -def make_tagger(nlp: Language, name: str, model: Model): +def make_tagger(nlp: Language, name: str, model: Model, scorer: Optional[Callable]): """Construct a part-of-speech tagger component. model (Model[List[Doc], List[Floats2d]]): A model instance that predicts @@ -52,7 +54,16 @@ def make_tagger(nlp: Language, name: str, model: Model): in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to 1). """ - return Tagger(nlp.vocab, model, name) + return Tagger(nlp.vocab, model, name, scorer=scorer) + + +def tagger_score(examples, **kwargs): + return Scorer.score_token_attr(examples, "tag", **kwargs) + + +@registry.scorers("spacy.tagger_scorer.v1") +def make_tagger_scorer(): + return tagger_score class Tagger(TrainablePipe): @@ -60,7 +71,7 @@ class Tagger(TrainablePipe): DOCS: https://spacy.io/api/tagger """ - def __init__(self, vocab, model, name="tagger"): + def __init__(self, vocab, model, name="tagger", *, scorer=tagger_score): """Initialize a part-of-speech tagger. vocab (Vocab): The shared vocabulary. @@ -76,6 +87,7 @@ class Tagger(TrainablePipe): self._rehearsal_model = None cfg = {"labels": []} self.cfg = dict(sorted(cfg.items())) + self.scorer = scorer @property def labels(self): @@ -289,15 +301,3 @@ class Tagger(TrainablePipe): self.cfg["labels"].append(label) self.vocab.strings.add(label) return 1 - - def score(self, examples, **kwargs): - """Score a batch of examples. - - examples (Iterable[Example]): The examples to score. - RETURNS (Dict[str, Any]): The scores, produced by - Scorer.score_token_attr for the attributes "tag". 
- - DOCS: https://spacy.io/api/tagger#score - """ - validate_examples(examples, "Tagger.score") - return Scorer.score_token_attr(examples, "tag", **kwargs) diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py index 0dde5de82..5ede18424 100644 --- a/spacy/pipeline/textcat.py +++ b/spacy/pipeline/textcat.py @@ -10,6 +10,7 @@ from ..training import Example, validate_examples, validate_get_examples from ..errors import Errors from ..scorer import Scorer from ..tokens import Doc +from ..util import registry from ..vocab import Vocab @@ -70,7 +71,11 @@ subword_features = true @Language.factory( "textcat", assigns=["doc.cats"], - default_config={"threshold": 0.5, "model": DEFAULT_SINGLE_TEXTCAT_MODEL}, + default_config={ + "threshold": 0.5, + "model": DEFAULT_SINGLE_TEXTCAT_MODEL, + "scorer": {"@scorers": "spacy.textcat_scorer.v1"}, + }, default_score_weights={ "cats_score": 1.0, "cats_score_desc": None, @@ -86,7 +91,11 @@ subword_features = true }, ) def make_textcat( - nlp: Language, name: str, model: Model[List[Doc], List[Floats2d]], threshold: float + nlp: Language, + name: str, + model: Model[List[Doc], List[Floats2d]], + threshold: float, + scorer: Optional[Callable], ) -> "TextCategorizer": """Create a TextCategorizer component. The text categorizer predicts categories over a whole document. It can learn one or more labels, and the labels are considered @@ -96,7 +105,21 @@ def make_textcat( scores for each category. threshold (float): Cutoff to consider a prediction "positive". """ - return TextCategorizer(nlp.vocab, model, name, threshold=threshold) + return TextCategorizer(nlp.vocab, model, name, threshold=threshold, scorer=scorer) + + +def textcat_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]: + return Scorer.score_cats( + examples, + "cats", + multi_label=False, + **kwargs, + ) + + +@registry.scorers("spacy.textcat_scorer.v1") +def make_textcat_scorer(): + return textcat_score class TextCategorizer(TrainablePipe): @@ -106,7 +129,13 @@ class TextCategorizer(TrainablePipe): """ def __init__( - self, vocab: Vocab, model: Model, name: str = "textcat", *, threshold: float + self, + vocab: Vocab, + model: Model, + name: str = "textcat", + *, + threshold: float, + scorer: Optional[Callable] = textcat_score, ) -> None: """Initialize a text categorizer for single-label classification. @@ -124,6 +153,7 @@ class TextCategorizer(TrainablePipe): self._rehearsal_model = None cfg = {"labels": [], "threshold": threshold, "positive_label": None} self.cfg = dict(cfg) + self.scorer = scorer @property def labels(self) -> Tuple[str]: @@ -354,26 +384,6 @@ class TextCategorizer(TrainablePipe): assert len(label_sample) > 0, Errors.E923.format(name=self.name) self.model.initialize(X=doc_sample, Y=label_sample) - def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]: - """Score a batch of examples. - - examples (Iterable[Example]): The examples to score. - RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_cats. 
- - DOCS: https://spacy.io/api/textcategorizer#score - """ - validate_examples(examples, "TextCategorizer.score") - self._validate_categories(examples) - kwargs.setdefault("threshold", self.cfg["threshold"]) - kwargs.setdefault("positive_label", self.cfg["positive_label"]) - return Scorer.score_cats( - examples, - "cats", - labels=self.labels, - multi_label=False, - **kwargs, - ) - def _validate_categories(self, examples: List[Example]): """Check whether the provided examples all have single-label cats annotations.""" for ex in examples: diff --git a/spacy/pipeline/textcat_multilabel.py b/spacy/pipeline/textcat_multilabel.py index ba36881af..efa7d28b5 100644 --- a/spacy/pipeline/textcat_multilabel.py +++ b/spacy/pipeline/textcat_multilabel.py @@ -5,10 +5,11 @@ from thinc.api import Model, Config from thinc.types import Floats2d from ..language import Language -from ..training import Example, validate_examples, validate_get_examples +from ..training import Example, validate_get_examples from ..errors import Errors from ..scorer import Scorer from ..tokens import Doc +from ..util import registry from ..vocab import Vocab from .textcat import TextCategorizer @@ -70,7 +71,11 @@ subword_features = true @Language.factory( "textcat_multilabel", assigns=["doc.cats"], - default_config={"threshold": 0.5, "model": DEFAULT_MULTI_TEXTCAT_MODEL}, + default_config={ + "threshold": 0.5, + "model": DEFAULT_MULTI_TEXTCAT_MODEL, + "scorer": {"@scorers": "spacy.textcat_multilabel_scorer.v1"}, + }, default_score_weights={ "cats_score": 1.0, "cats_score_desc": None, @@ -86,7 +91,11 @@ subword_features = true }, ) def make_multilabel_textcat( - nlp: Language, name: str, model: Model[List[Doc], List[Floats2d]], threshold: float + nlp: Language, + name: str, + model: Model[List[Doc], List[Floats2d]], + threshold: float, + scorer: Optional[Callable], ) -> "TextCategorizer": """Create a TextCategorizer component. The text categorizer predicts categories over a whole document. It can learn one or more labels, and the labels are considered @@ -97,7 +106,23 @@ def make_multilabel_textcat( scores for each category. threshold (float): Cutoff to consider a prediction "positive". """ - return MultiLabel_TextCategorizer(nlp.vocab, model, name, threshold=threshold) + return MultiLabel_TextCategorizer( + nlp.vocab, model, name, threshold=threshold, scorer=scorer + ) + + +def textcat_multilabel_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]: + return Scorer.score_cats( + examples, + "cats", + multi_label=True, + **kwargs, + ) + + +@registry.scorers("spacy.textcat_multilabel_scorer.v1") +def make_textcat_multilabel_scorer(): + return textcat_multilabel_score class MultiLabel_TextCategorizer(TextCategorizer): @@ -113,6 +138,7 @@ class MultiLabel_TextCategorizer(TextCategorizer): name: str = "textcat_multilabel", *, threshold: float, + scorer: Optional[Callable] = textcat_multilabel_score, ) -> None: """Initialize a text categorizer for multi-label classification. @@ -130,6 +156,7 @@ class MultiLabel_TextCategorizer(TextCategorizer): self._rehearsal_model = None cfg = {"labels": [], "threshold": threshold} self.cfg = dict(cfg) + self.scorer = scorer def initialize( self, @@ -166,24 +193,6 @@ class MultiLabel_TextCategorizer(TextCategorizer): assert len(label_sample) > 0, Errors.E923.format(name=self.name) self.model.initialize(X=doc_sample, Y=label_sample) - def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]: - """Score a batch of examples. 
- - examples (Iterable[Example]): The examples to score. - RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_cats. - - DOCS: https://spacy.io/api/textcategorizer#score - """ - validate_examples(examples, "MultiLabel_TextCategorizer.score") - kwargs.setdefault("threshold", self.cfg["threshold"]) - return Scorer.score_cats( - examples, - "cats", - labels=self.labels, - multi_label=True, - **kwargs, - ) - def _validate_categories(self, examples: List[Example]): """This component allows any type of single- or multi-label annotations. This method overwrites the more strict one from 'textcat'.""" diff --git a/spacy/pipeline/trainable_pipe.pxd b/spacy/pipeline/trainable_pipe.pxd index d5cdbb511..65daa8b22 100644 --- a/spacy/pipeline/trainable_pipe.pxd +++ b/spacy/pipeline/trainable_pipe.pxd @@ -5,3 +5,4 @@ cdef class TrainablePipe(Pipe): cdef public Vocab vocab cdef public object model cdef public object cfg + cdef public object scorer diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx index 5e11f5972..fa2b28aa5 100644 --- a/spacy/pipeline/transition_parser.pyx +++ b/spacy/pipeline/transition_parser.pyx @@ -49,7 +49,8 @@ cdef class Parser(TrainablePipe): beam_density=0.0, beam_update_prob=0.0, multitasks=tuple(), - incorrect_spans_key=None + incorrect_spans_key=None, + scorer=None, ): """Create a Parser. @@ -117,6 +118,7 @@ cdef class Parser(TrainablePipe): self.add_multitask_objective(multitask) self._rehearsal_model = None + self.scorer = scorer def __getnewargs_ex__(self): """This allows pickling the Parser and its keyword-only init arguments""" diff --git a/spacy/scorer.py b/spacy/scorer.py index f4ccb2269..bd305c123 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -537,7 +537,7 @@ class Scorer: @staticmethod def score_links( - examples: Iterable[Example], *, negative_labels: Iterable[str] + examples: Iterable[Example], *, negative_labels: Iterable[str], **cfg ) -> Dict[str, Any]: """Returns PRF for predicted links on the entity level. 
To disentangle the performance of the NEL from the NER, @@ -711,7 +711,7 @@ class Scorer: } -def get_ner_prf(examples: Iterable[Example]) -> Dict[str, Any]: +def get_ner_prf(examples: Iterable[Example], **kwargs) -> Dict[str, Any]: """Compute micro-PRF and per-entity PRF scores for a sequence of examples.""" score_per_type = defaultdict(PRFScore) for eg in examples: diff --git a/spacy/tests/pipeline/test_attributeruler.py b/spacy/tests/pipeline/test_attributeruler.py index 9c750ffd0..dab3ebf57 100644 --- a/spacy/tests/pipeline/test_attributeruler.py +++ b/spacy/tests/pipeline/test_attributeruler.py @@ -32,24 +32,6 @@ def pattern_dicts(): ] -@registry.misc("attribute_ruler_patterns") -def attribute_ruler_patterns(): - return [ - { - "patterns": [[{"ORTH": "a"}], [{"ORTH": "irrelevant"}]], - "attrs": {"LEMMA": "the", "MORPH": "Case=Nom|Number=Plur"}, - }, - # one pattern sets the lemma - {"patterns": [[{"ORTH": "test"}]], "attrs": {"LEMMA": "cat"}}, - # another pattern sets the morphology - { - "patterns": [[{"ORTH": "test"}]], - "attrs": {"MORPH": "Case=Nom|Number=Sing"}, - "index": 0, - }, - ] - - @pytest.fixture def tag_map(): return { @@ -121,7 +103,25 @@ def test_attributeruler_init_patterns(nlp, pattern_dicts): assert doc.has_annotation("LEMMA") assert doc.has_annotation("MORPH") nlp.remove_pipe("attribute_ruler") + # initialize with patterns from misc registry + @registry.misc("attribute_ruler_patterns") + def attribute_ruler_patterns(): + return [ + { + "patterns": [[{"ORTH": "a"}], [{"ORTH": "irrelevant"}]], + "attrs": {"LEMMA": "the", "MORPH": "Case=Nom|Number=Plur"}, + }, + # one pattern sets the lemma + {"patterns": [[{"ORTH": "test"}]], "attrs": {"LEMMA": "cat"}}, + # another pattern sets the morphology + { + "patterns": [[{"ORTH": "test"}]], + "attrs": {"MORPH": "Case=Nom|Number=Sing"}, + "index": 0, + }, + ] + nlp.config["initialize"]["components"]["attribute_ruler"] = { "patterns": {"@misc": "attribute_ruler_patterns"} } @@ -162,6 +162,26 @@ def test_attributeruler_score(nlp, pattern_dicts): assert scores["lemma_acc"] == pytest.approx(0.2) # no morphs are set assert scores["morph_acc"] is None + nlp.remove_pipe("attribute_ruler") + + # test with custom scorer + @registry.misc("weird_scorer.v1") + def make_weird_scorer(): + def weird_scorer(examples, weird_score, **kwargs): + return {"weird_score": weird_score} + + return weird_scorer + + ruler = nlp.add_pipe( + "attribute_ruler", config={"scorer": {"@misc": "weird_scorer.v1"}} + ) + ruler.initialize(lambda: [], patterns=pattern_dicts) + scores = nlp.evaluate(dev_examples, scorer_cfg={"weird_score": 0.12345}) + assert scores["weird_score"] == 0.12345 + assert "token_acc" in scores + assert "lemma_acc" not in scores + scores = nlp.evaluate(dev_examples, scorer_cfg={"weird_score": 0.23456}) + assert scores["weird_score"] == 0.23456 def test_attributeruler_rule_order(nlp): diff --git a/spacy/util.py b/spacy/util.py index 421287ce2..5270bf080 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -95,6 +95,7 @@ class registry(thinc.registry): readers = catalogue.create("spacy", "readers", entry_points=True) augmenters = catalogue.create("spacy", "augmenters", entry_points=True) loggers = catalogue.create("spacy", "loggers", entry_points=True) + scorers = catalogue.create("spacy", "scorers", entry_points=True) # These are factories registered via third-party packages and the # spacy_factories entry point. This registry only exists so we can easily # load them via the entry points. 
The "true" factories are added via the From 944ad6b1d4a330c5e67e5c873749da239e1bc586 Mon Sep 17 00:00:00 2001 From: Edward <43848523+thomashacker@users.noreply.github.com> Date: Thu, 12 Aug 2021 11:14:48 +0200 Subject: [PATCH 005/133] Add new parameter for saving every n epoch in pretraining (#8912) * Add parameter for saving every n epoch * Add new parameter in schemas * Add new parameter in default_config * Adjust schemas * format code --- spacy/default_config_pretraining.cfg | 1 + spacy/schemas.py | 3 ++- spacy/training/pretrain.py | 12 ++++++++++-- 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/spacy/default_config_pretraining.cfg b/spacy/default_config_pretraining.cfg index 16f767772..d70ecf04c 100644 --- a/spacy/default_config_pretraining.cfg +++ b/spacy/default_config_pretraining.cfg @@ -5,6 +5,7 @@ raw_text = null max_epochs = 1000 dropout = 0.2 n_save_every = null +n_save_epoch = null component = "tok2vec" layer = "" corpus = "corpora.pretrain" diff --git a/spacy/schemas.py b/spacy/schemas.py index 83623b104..bd3f0ecf0 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -351,7 +351,8 @@ class ConfigSchemaPretrain(BaseModel): # fmt: off max_epochs: StrictInt = Field(..., title="Maximum number of epochs to train for") dropout: StrictFloat = Field(..., title="Dropout rate") - n_save_every: Optional[StrictInt] = Field(..., title="Saving frequency") + n_save_every: Optional[StrictInt] = Field(..., title="Saving additional temporary model after n batches within an epoch") + n_save_epoch: Optional[StrictInt] = Field(..., title="Saving model after every n epoch") optimizer: Optimizer = Field(..., title="The optimizer to use") corpus: StrictStr = Field(..., title="Path in the config to the training data") batcher: Batcher = Field(..., title="Batcher for the training data") diff --git a/spacy/training/pretrain.py b/spacy/training/pretrain.py index 6d7850212..88f1dc0bb 100644 --- a/spacy/training/pretrain.py +++ b/spacy/training/pretrain.py @@ -48,7 +48,10 @@ def pretrain( objective = model.attrs["loss"] # TODO: move this to logger function? 
tracker = ProgressTracker(frequency=10000) - msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_resume}") + if P["n_save_epoch"]: + msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_resume} - saving every {P['n_save_epoch']} epoch") + else: + msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_resume}") row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")} msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings) @@ -77,7 +80,12 @@ def pretrain( msg.row(progress, **row_settings) if P["n_save_every"] and (batch_id % P["n_save_every"] == 0): _save_model(epoch, is_temp=True) - _save_model(epoch) + + if P["n_save_epoch"]: + if epoch % P["n_save_epoch"] == 0 or epoch == P["max_epochs"] - 1: + _save_model(epoch) + else: + _save_model(epoch) tracker.epoch_loss = 0.0 From b278f31ee684e5d402a1891a0445a9c7c1c1f644 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 12 Aug 2021 12:50:03 +0200 Subject: [PATCH 006/133] Document scorers in registry and components from #8766 (#8929) * Document scorers in registry and components from #8766 * Update spacy/pipeline/lemmatizer.py Co-authored-by: Sofie Van Landeghem * Update website/docs/api/dependencyparser.md Co-authored-by: Sofie Van Landeghem * Reformat Co-authored-by: Sofie Van Landeghem --- spacy/pipeline/attributeruler.py | 8 +++++--- spacy/pipeline/dep_parser.pyx | 1 + spacy/pipeline/entity_linker.py | 3 +++ spacy/pipeline/entityruler.py | 2 ++ spacy/pipeline/lemmatizer.py | 2 ++ spacy/pipeline/morphologizer.pyx | 3 +++ spacy/pipeline/ner.pyx | 2 ++ spacy/pipeline/sentencizer.pyx | 3 ++- spacy/pipeline/senter.pyx | 2 ++ spacy/pipeline/spancat.py | 16 +++++++++++++++ spacy/pipeline/tagger.pyx | 2 ++ spacy/pipeline/textcat.py | 3 +++ spacy/pipeline/transition_parser.pyx | 1 + website/docs/api/attributeruler.md | 28 +++++++------------------- website/docs/api/dependencyparser.md | 16 +-------------- website/docs/api/entitylinker.md | 16 +-------------- website/docs/api/entityrecognizer.md | 16 +-------------- website/docs/api/entityruler.md | 1 + website/docs/api/lemmatizer.md | 12 ++++++----- website/docs/api/morphologizer.md | 12 ++++++----- website/docs/api/pipe.md | 10 +++++---- website/docs/api/scorer.md | 20 ++++++++++++++---- website/docs/api/sentencerecognizer.md | 27 +++++++------------------ website/docs/api/sentencizer.md | 26 ++++++------------------ website/docs/api/spancategorizer.md | 17 +--------------- website/docs/api/tagger.md | 27 +++++++------------------ website/docs/api/textcategorizer.md | 15 +++++++------- website/docs/api/top-level.md | 1 + 28 files changed, 121 insertions(+), 171 deletions(-) diff --git a/spacy/pipeline/attributeruler.py b/spacy/pipeline/attributeruler.py index 733a65199..b1a2f3e9c 100644 --- a/spacy/pipeline/attributeruler.py +++ b/spacy/pipeline/attributeruler.py @@ -36,9 +36,7 @@ def make_attribute_ruler( return AttributeRuler(nlp.vocab, name, validate=validate, scorer=scorer) -def attribute_ruler_score( - examples: Iterable[Example], **kwargs -) -> Dict[str, Any]: +def attribute_ruler_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]: def morph_key_getter(token, attr): return getattr(token, attr).key @@ -84,6 +82,10 @@ class AttributeRuler(Pipe): vocab (Vocab): The vocab. name (str): The pipe name. Defaults to "attribute_ruler". + scorer (Optional[Callable]): The scoring method. 
Defaults to + Scorer.score_token_attr for the attributes "tag", "pos", "morph" and + "lemma" and Scorer.score_token_attr_per_feat for the attribute + "morph". RETURNS (AttributeRuler): The AttributeRuler component. diff --git a/spacy/pipeline/dep_parser.pyx b/spacy/pipeline/dep_parser.pyx index 59364326b..50c57ee5b 100644 --- a/spacy/pipeline/dep_parser.pyx +++ b/spacy/pipeline/dep_parser.pyx @@ -102,6 +102,7 @@ def make_parser( primarily affects the label accuracy, it can also affect the attachment structure, as the labels are used to represent the pseudo-projectivity transformation. + scorer (Optional[Callable]): The scoring method. """ return DependencyParser( nlp.vocab, diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index 919d1fe6d..a17eed13c 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -83,6 +83,7 @@ def make_entity_linker( entity_vector_length (int): Size of encoding vectors in the KB. get_candidates (Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]): Function that produces a list of candidates, given a certain knowledge base and a textual mention. + scorer (Optional[Callable]): The scoring method. """ return EntityLinker( nlp.vocab, @@ -142,6 +143,8 @@ class EntityLinker(TrainablePipe): entity_vector_length (int): Size of encoding vectors in the KB. get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that produces a list of candidates, given a certain knowledge base and a textual mention. + scorer (Optional[Callable]): The scoring method. Defaults to + Scorer.score_links. DOCS: https://spacy.io/api/entitylinker#init """ diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index d2a0c5045..ad67a7a1f 100644 --- a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -106,6 +106,8 @@ class EntityRuler(Pipe): overwrite_ents (bool): If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. ent_id_sep (str): Separator used internally for entity IDs. + scorer (Optional[Callable]): The scoring method. Defaults to + spacy.scorer.get_ner_prf. DOCS: https://spacy.io/api/entityruler#init """ diff --git a/spacy/pipeline/lemmatizer.py b/spacy/pipeline/lemmatizer.py index 1bf513661..74f502d80 100644 --- a/spacy/pipeline/lemmatizer.py +++ b/spacy/pipeline/lemmatizer.py @@ -90,6 +90,8 @@ class Lemmatizer(Pipe): mode (str): The lemmatizer mode: "lookup", "rule". Defaults to "lookup". overwrite (bool): Whether to overwrite existing lemmas. Defaults to `False`. + scorer (Optional[Callable]): The scoring method. Defaults to + Scorer.score_token_attr for the attribute "lemma". DOCS: https://spacy.io/api/lemmatizer#init """ diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index c5293e860..f476f02c4 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -95,6 +95,9 @@ class Morphologizer(Tagger): model (thinc.api.Model): The Thinc Model powering the pipeline component. name (str): The component instance name, used to add entries to the losses during training. + scorer (Optional[Callable]): The scoring method. Defaults to + Scorer.score_token_attr for the attributes "pos" and "morph" and + Scorer.score_token_attr_per_feat for the attribute "morph". 
DOCS: https://spacy.io/api/morphologizer#init """ diff --git a/spacy/pipeline/ner.pyx b/spacy/pipeline/ner.pyx index 857e3c088..4835a8c4b 100644 --- a/spacy/pipeline/ner.pyx +++ b/spacy/pipeline/ner.pyx @@ -82,6 +82,7 @@ def make_ner( incorrect_spans_key (Optional[str]): Identifies spans that are known to be incorrect entity annotations. The incorrect entity annotations can be stored in the span group, under this key. + scorer (Optional[Callable]): The scoring method. """ return EntityRecognizer( nlp.vocab, @@ -158,6 +159,7 @@ def make_beam_ner( and are faster to compute. incorrect_spans_key (Optional[str]): Optional key into span groups of entities known to be non-entities. + scorer (Optional[Callable]): The scoring method. """ return EntityRecognizer( nlp.vocab, diff --git a/spacy/pipeline/sentencizer.pyx b/spacy/pipeline/sentencizer.pyx index c535c7e48..5e64246ad 100644 --- a/spacy/pipeline/sentencizer.pyx +++ b/spacy/pipeline/sentencizer.pyx @@ -55,7 +55,8 @@ class Sentencizer(Pipe): punct_chars (list): Punctuation characters to split on. Will be serialized with the nlp object. - RETURNS (Sentencizer): The sentencizer component. + scorer (Optional[Callable]): The scoring method. Defaults to + Scorer.score_spans for the attribute "sents". DOCS: https://spacy.io/api/sentencizer#init """ diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx index 3eeb9b5da..b1fb2ec37 100644 --- a/spacy/pipeline/senter.pyx +++ b/spacy/pipeline/senter.pyx @@ -69,6 +69,8 @@ class SentenceRecognizer(Tagger): model (thinc.api.Model): The Thinc Model powering the pipeline component. name (str): The component instance name, used to add entries to the losses during training. + scorer (Optional[Callable]): The scoring method. Defaults to + Scorer.score_spans for the attribute "sents". DOCS: https://spacy.io/api/sentencerecognizer#init """ diff --git a/spacy/pipeline/spancat.py b/spacy/pipeline/spancat.py index a143ac4eb..6bc93d693 100644 --- a/spacy/pipeline/spancat.py +++ b/spacy/pipeline/spancat.py @@ -181,6 +181,22 @@ class SpanCategorizer(TrainablePipe): scorer: Optional[Callable] = spancat_score, ) -> None: """Initialize the span categorizer. + vocab (Vocab): The shared vocabulary. + model (thinc.api.Model): The Thinc Model powering the pipeline component. + name (str): The component instance name, used to add entries to the + losses during training. + spans_key (str): Key of the Doc.spans dict to save the spans under. + During initialization and training, the component will look for + spans on the reference document under the same key. Defaults to + `"spans"`. + threshold (float): Minimum probability to consider a prediction + positive. Spans with a positive prediction will be saved on the Doc. + Defaults to 0.5. + max_positive (Optional[int]): Maximum number of labels to consider + positive per span. Defaults to None, indicating no limit. + scorer (Optional[Callable]): The scoring method. Defaults to + Scorer.score_spans for the Doc.spans[spans_key] with overlapping + spans allowed. DOCS: https://spacy.io/api/spancategorizer#init """ diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx index 327a18f25..16d16b497 100644 --- a/spacy/pipeline/tagger.pyx +++ b/spacy/pipeline/tagger.pyx @@ -78,6 +78,8 @@ class Tagger(TrainablePipe): model (thinc.api.Model): The Thinc Model powering the pipeline component. name (str): The component instance name, used to add entries to the losses during training. + scorer (Optional[Callable]): The scoring method. 
Defaults to + Scorer.score_token_attr for the attribute "tag". DOCS: https://spacy.io/api/tagger#init """ diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py index 5ede18424..6956a919d 100644 --- a/spacy/pipeline/textcat.py +++ b/spacy/pipeline/textcat.py @@ -104,6 +104,7 @@ def make_textcat( model (Model[List[Doc], List[Floats2d]]): A model instance that predicts scores for each category. threshold (float): Cutoff to consider a prediction "positive". + scorer (Optional[Callable]): The scoring method. """ return TextCategorizer(nlp.vocab, model, name, threshold=threshold, scorer=scorer) @@ -144,6 +145,8 @@ class TextCategorizer(TrainablePipe): name (str): The component instance name, used to add entries to the losses during training. threshold (float): Cutoff to consider a prediction "positive". + scorer (Optional[Callable]): The scoring method. Defaults to + Scorer.score_cats for the attribute "cats". DOCS: https://spacy.io/api/textcategorizer#init """ diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx index fa2b28aa5..2571af102 100644 --- a/spacy/pipeline/transition_parser.pyx +++ b/spacy/pipeline/transition_parser.pyx @@ -87,6 +87,7 @@ cdef class Parser(TrainablePipe): incorrect_spans_key (Optional[str]): Identifies spans that are known to be incorrect entity annotations. The incorrect entity annotations can be stored in the span group, under this key. + scorer (Optional[Callable]): The scoring method. Defaults to None. """ self.vocab = vocab self.name = name diff --git a/website/docs/api/attributeruler.md b/website/docs/api/attributeruler.md index a253ca9f8..965bffbcc 100644 --- a/website/docs/api/attributeruler.md +++ b/website/docs/api/attributeruler.md @@ -48,12 +48,13 @@ Initialize the attribute ruler. > ruler = nlp.add_pipe("attribute_ruler") > ``` -| Name | Description | -| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------- | -| `vocab` | The shared vocabulary to pass to the matcher. ~~Vocab~~ | -| `name` | Instance name of the current pipeline component. Typically passed in automatically from the factory when the component is added. ~~str~~ | -| _keyword-only_ | | -| `validate` | Whether patterns should be validated (passed to the [`Matcher`](/api/matcher#init)). Defaults to `False`. ~~bool~~ | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `vocab` | The shared vocabulary to pass to the matcher. ~~Vocab~~ | +| `name` | Instance name of the current pipeline component. Typically passed in automatically from the factory when the component is added. ~~str~~ | +| _keyword-only_ | | +| `validate` | Whether patterns should be validated (passed to the [`Matcher`](/api/matcher#init)). Defaults to `False`. ~~bool~~ | +| `scorer` | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attributes `"tag`", `"pos"`, `"morph"` and `"lemma"` and [`Scorer.score_token_attr_per_feat`](/api/scorer#score_token_attr_per_feat) for the attribute `"morph"`. ~~Optional[Callable]~~ | ## AttributeRuler.\_\_call\_\_ {#call tag="method"} @@ -175,21 +176,6 @@ Load attribute ruler patterns from morph rules. 
| ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `morph_rules` | The morph rules that map token text and fine-grained tags to coarse-grained tags, lemmas and morphological features. ~~Dict[str, Dict[str, Dict[Union[int, str], Union[int, str]]]]~~ | -## AttributeRuler.score {#score tag="method" new="3"} - -Score a batch of examples. - -> #### Example -> -> ```python -> scores = ruler.score(examples) -> ``` - -| Name | Description | -| ----------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `examples` | The examples to score. ~~Iterable[Example]~~ | -| **RETURNS** | The scores, produced by [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attributes `"tag"`, `"pos"`, `"morph"` and `"lemma"` if present in any of the target token attributes. ~~Dict[str, float]~~ | - ## AttributeRuler.to_disk {#to_disk tag="method"} Serialize the pipe to disk. diff --git a/website/docs/api/dependencyparser.md b/website/docs/api/dependencyparser.md index fa02a6f99..3d326a41b 100644 --- a/website/docs/api/dependencyparser.md +++ b/website/docs/api/dependencyparser.md @@ -91,6 +91,7 @@ shortcut for this and instantiate the component using its string name and | `update_with_oracle_cut_size` | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. Defaults to `100`. ~~int~~ | | `learn_tokens` | Whether to learn to merge subtokens that are split relative to the gold standard. Experimental. Defaults to `False`. ~~bool~~ | | `min_action_freq` | The minimum frequency of labelled actions to retain. Rarer labelled actions have their label backed-off to "dep". While this primarily affects the label accuracy, it can also affect the attachment structure, as the labels are used to represent the pseudo-projectivity transformation. ~~int~~ | +| `scorer` | The scoring method. Defaults to [`Scorer.score_deps`](/api/scorer#score_deps) for the attribute `"dep"` ignoring the labels `p` and `punct` and [`Scorer.score_spans`](/api/scorer/#score_spans) for the attribute `"sents"`. ~~Optional[Callable]~~ | ## DependencyParser.\_\_call\_\_ {#call tag="method"} @@ -259,21 +260,6 @@ predicted scores. | `scores` | Scores representing the model's predictions. ~~StateClass~~ | | **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ | -## DependencyParser.score {#score tag="method" new="3"} - -Score a batch of examples. - -> #### Example -> -> ```python -> scores = parser.score(examples) -> ``` - -| Name | Description | -| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `examples` | The examples to score. ~~Iterable[Example]~~ | -| **RETURNS** | The scores, produced by [`Scorer.score_spans`](/api/scorer#score_spans) and [`Scorer.score_deps`](/api/scorer#score_deps). 
~~Dict[str, Union[float, Dict[str, float]]]~~ | - ## DependencyParser.create_optimizer {#create_optimizer tag="method"} Create an [`Optimizer`](https://thinc.ai/docs/api-optimizers) for the pipeline diff --git a/website/docs/api/entitylinker.md b/website/docs/api/entitylinker.md index 2994d934b..764f63a1a 100644 --- a/website/docs/api/entitylinker.md +++ b/website/docs/api/entitylinker.md @@ -50,6 +50,7 @@ architectures and their arguments and hyperparameters. | `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~ | | `entity_vector_length` | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~ | | `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ | +| `scorer` | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ | ```python %%GITHUB_SPACY/spacy/pipeline/entity_linker.py @@ -259,21 +260,6 @@ pipe's entity linking model and context encoder. Delegates to | `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ | | **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ | -## EntityLinker.score {#score tag="method" new="3"} - -Score a batch of examples. - -> #### Example -> -> ```python -> scores = entity_linker.score(examples) -> ``` - -| Name | Description | -| ----------- | ---------------------------------------------------------------------------------------------- | -| `examples` | The examples to score. ~~Iterable[Example]~~ | -| **RETURNS** | The scores, produced by [`Scorer.score_links`](/api/scorer#score_links) . ~~Dict[str, float]~~ | - ## EntityLinker.create_optimizer {#create_optimizer tag="method"} Create an optimizer for the pipeline component. diff --git a/website/docs/api/entityrecognizer.md b/website/docs/api/entityrecognizer.md index 601b644c1..68c048428 100644 --- a/website/docs/api/entityrecognizer.md +++ b/website/docs/api/entityrecognizer.md @@ -48,6 +48,7 @@ architectures and their arguments and hyperparameters. | `update_with_oracle_cut_size` | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. Defaults to `100`. ~~int~~ | | `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [TransitionBasedParser](/api/architectures#TransitionBasedParser). ~~Model[List[Doc], List[Floats2d]]~~ | | `incorrect_spans_key` | This key refers to a `SpanGroup` in `doc.spans` that specifies incorrect spans. The NER wiill learn not to predict (exactly) those spans. Defaults to `None`. ~~Optional[str]~~ | +| `scorer` | The scoring method. Defaults to [`spacy.scorer.get_ner_prf`](/api/scorer#get_ner_prf). ~~Optional[Callable]~~ | ```python %%GITHUB_SPACY/spacy/pipeline/ner.pyx @@ -251,21 +252,6 @@ predicted scores. | `scores` | Scores representing the model's predictions. ~~StateClass~~ | | **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ | -## EntityRecognizer.score {#score tag="method" new="3"} - -Score a batch of examples. 
- -> #### Example -> -> ```python -> scores = ner.score(examples) -> ``` - -| Name | Description | -| ----------- | --------------------------------------------------------- | -| `examples` | The examples to score. ~~Iterable[Example]~~ | -| **RETURNS** | The scores. ~~Dict[str, Union[float, Dict[str, float]]]~~ | - ## EntityRecognizer.create_optimizer {#create_optimizer tag="method"} Create an optimizer for the pipeline component. diff --git a/website/docs/api/entityruler.md b/website/docs/api/entityruler.md index 93b5da45a..63b4d1823 100644 --- a/website/docs/api/entityruler.md +++ b/website/docs/api/entityruler.md @@ -40,6 +40,7 @@ how the component should be configured. You can override its settings via the | `validate` | Whether patterns should be validated (passed to the `Matcher` and `PhraseMatcher`). Defaults to `False`. ~~bool~~ | | `overwrite_ents` | If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. Defaults to `False`. ~~bool~~ | | `ent_id_sep` | Separator used internally for entity IDs. Defaults to `"\|\|"`. ~~str~~ | +| `scorer` | The scoring method. Defaults to [`spacy.scorer.get_ner_prf`](/api/scorer#get_ner_prf). ~~Optional[Callable]~~ | ```python %%GITHUB_SPACY/spacy/pipeline/entityruler.py diff --git a/website/docs/api/lemmatizer.md b/website/docs/api/lemmatizer.md index 279821e71..b67673599 100644 --- a/website/docs/api/lemmatizer.md +++ b/website/docs/api/lemmatizer.md @@ -47,11 +47,13 @@ data format used by the lookup and rule-based lemmatizers, see > nlp.add_pipe("lemmatizer", config=config) > ``` -| Setting | Description | -| ----------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `mode` | The lemmatizer mode, e.g. `"lookup"` or `"rule"`. Defaults to `lookup` if no language-specific lemmatizer is available (see the following table). ~~str~~ | -| `overwrite` | Whether to overwrite existing lemmas. Defaults to `False`. ~~bool~~ | -| `model` | **Not yet implemented:** the model to use. ~~Model~~ | +| Setting | Description | +| -------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `mode` | The lemmatizer mode, e.g. `"lookup"` or `"rule"`. Defaults to `lookup` if no language-specific lemmatizer is available (see the following table). ~~str~~ | +| `overwrite` | Whether to overwrite existing lemmas. Defaults to `False`. ~~bool~~ | +| `model` | **Not yet implemented:** the model to use. ~~Model~~ | +| _keyword-only_ | | +| `scorer` | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attribute `"lemma"`. ~~Optional[Callable]~~ | Many languages specify a default lemmatizer mode other than `lookup` if a better lemmatizer is available. The lemmatizer modes `rule` and `pos_lookup` require diff --git a/website/docs/api/morphologizer.md b/website/docs/api/morphologizer.md index d2dd28ac2..d2a927f4b 100644 --- a/website/docs/api/morphologizer.md +++ b/website/docs/api/morphologizer.md @@ -61,11 +61,13 @@ shortcut for this and instantiate the component using its string name and > morphologizer = Morphologizer(nlp.vocab, model) > ``` -| Name | Description | -| ------- | -------------------------------------------------------------------------------------------------------------------- | -| `vocab` | The shared vocabulary. 
~~Vocab~~ | -| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ | -| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | +| Name | Description | +| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | The shared vocabulary. ~~Vocab~~ | +| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ | +| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | +| _keyword-only_ | | +| `scorer` | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attributes `"pos"` and `"morph"` and [`Scorer.score_token_attr_per_feat`](/api/scorer#score_token_attr_per_feat) for the attribute `"morph"`. ~~Optional[Callable]~~ | ## Morphologizer.\_\_call\_\_ {#call tag="method"} diff --git a/website/docs/api/pipe.md b/website/docs/api/pipe.md index 2f856c667..263942e3e 100644 --- a/website/docs/api/pipe.md +++ b/website/docs/api/pipe.md @@ -297,10 +297,12 @@ Score a batch of examples. > scores = pipe.score(examples) > ``` -| Name | Description | -| ----------- | ------------------------------------------------------------------------------------------------------- | -| `examples` | The examples to score. ~~Iterable[Example]~~ | -| **RETURNS** | The scores, e.g. produced by the [`Scorer`](/api/scorer). ~~Dict[str, Union[float, Dict[str, float]]]~~ | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------- | +| `examples` | The examples to score. ~~Iterable[Example]~~ | +| _keyword-only_ | +| `\*\*kwargs` | Any additional settings to pass on to the scorer. ~~Any~~ | +| **RETURNS** | The scores, e.g. produced by the [`Scorer`](/api/scorer). ~~Dict[str, Union[float, Dict[str, float]]]~~ | ## TrainablePipe.create_optimizer {#create_optimizer tag="method"} diff --git a/website/docs/api/scorer.md b/website/docs/api/scorer.md index ad908f204..da7da5f82 100644 --- a/website/docs/api/scorer.md +++ b/website/docs/api/scorer.md @@ -27,9 +27,13 @@ Create a new `Scorer`. > scorer = Scorer(nlp) > ``` -| Name | Description | -| ----- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `nlp` | The pipeline to use for scoring, where each pipeline component may provide a scoring method. If none is provided, then a default pipeline for the multi-language code `xx` is constructed containing: `senter`, `tagger`, `morphologizer`, `parser`, `ner`, `textcat`. ~~Language~~ | +| Name | Description | +| ------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `nlp` | The pipeline to use for scoring, where each pipeline component may provide a scoring method. 
If none is provided, then a default pipeline is constructed using the `default_lang` and `default_pipeline` settings. ~~Optional[Language]~~ | +| `default_lang` | The language to use for a default pipeline if `nlp` is not provided. Defaults to `xx`. ~~str~~ | +| `default_pipeline` | The pipeline components to use for a default pipeline if `nlp` is not provided. Defaults to `("senter", "tagger", "morphologizer", "parser", "ner", "textcat")`. ~~Iterable[string]~~ | +| _keyword-only_ | | +| `\*\*kwargs` | Any additional settings to pass on to the individual scoring methods. ~~Any~~ | ## Scorer.score {#score tag="method"} @@ -80,7 +84,7 @@ Docs with `has_unknown_spaces` are skipped during scoring. > ``` | Name | Description | -| ----------- | ------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------ | +| ----------- | ------------------------------------------------------------------------------------------------------------------- | | `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ | | **RETURNS** | `Dict` | A dictionary containing the scores `token_acc`, `token_p`, `token_r`, `token_f`. ~~Dict[str, float]]~~ | @@ -253,3 +257,11 @@ entities that overlap between the gold reference and the predictions. | _keyword-only_ | | | `negative_labels` | The string values that refer to no annotation (e.g. "NIL"). ~~Iterable[str]~~ | | **RETURNS** | A dictionary containing the scores. ~~Dict[str, Optional[float]]~~ | + +## get_ner_prf {#get_ner_prf new="3"} + +Compute micro-PRF and per-entity PRF scores. + +| Name | Description | +| ---------- | ------------------------------------------------------------------------------------------------------------------- | +| `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ | diff --git a/website/docs/api/sentencerecognizer.md b/website/docs/api/sentencerecognizer.md index e82a4bef6..d6d82c259 100644 --- a/website/docs/api/sentencerecognizer.md +++ b/website/docs/api/sentencerecognizer.md @@ -60,11 +60,13 @@ Create a new pipeline instance. In your application, you would normally use a shortcut for this and instantiate the component using its string name and [`nlp.add_pipe`](/api/language#add_pipe). -| Name | Description | -| ------- | -------------------------------------------------------------------------------------------------------------------- | -| `vocab` | The shared vocabulary. ~~Vocab~~ | -| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ | -| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | +| Name | Description | +| -------------- | ----------------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | The shared vocabulary. ~~Vocab~~ | +| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ | +| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | +| _keyword-only_ | | +| `scorer` | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for the attribute `"sents"`. 
~~Optional[Callable]~~ | ## SentenceRecognizer.\_\_call\_\_ {#call tag="method"} @@ -238,21 +240,6 @@ predicted scores. | `scores` | Scores representing the model's predictions. | | **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ | -## SentenceRecognizer.score {#score tag="method" new="3"} - -Score a batch of examples. - -> #### Example -> -> ```python -> scores = senter.score(examples) -> ``` - -| Name | Description | -| ----------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `examples` | The examples to score. ~~Iterable[Example]~~ | -| **RETURNS** | The scores, produced by [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attributes `"pos"`, `"tag"` and `"lemma"`. ~~Dict[str, float]~~ | - ## SentenceRecognizer.create_optimizer {#create_optimizer tag="method"} Create an optimizer for the pipeline component. diff --git a/website/docs/api/sentencizer.md b/website/docs/api/sentencizer.md index 75a253fc0..4570e8746 100644 --- a/website/docs/api/sentencizer.md +++ b/website/docs/api/sentencizer.md @@ -28,7 +28,7 @@ how the component should be configured. You can override its settings via the > ``` | Setting | Description | -| ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------ | ------ | +| ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------ | | `punct_chars` | Optional custom list of punctuation characters that mark sentence ends. See below for defaults if not set. Defaults to `None`. ~~Optional[List[str]]~~ | `None` | ```python @@ -50,10 +50,11 @@ Initialize the sentencizer. > sentencizer = Sentencizer() > ``` -| Name | Description | -| -------------- | ----------------------------------------------------------------------------------------------------------------------- | -| _keyword-only_ | | -| `punct_chars` | Optional custom list of punctuation characters that mark sentence ends. See below for defaults. ~~Optional[List[str]]~~ | +| Name | Description | +| -------------- | ---------------------------------------------------------------------------------------------------------------------------------- | +| _keyword-only_ | | +| `punct_chars` | Optional custom list of punctuation characters that mark sentence ends. See below for defaults. ~~Optional[List[str]]~~ | +| `scorer` | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for the attribute `"sents"` ~~Optional[Callable]~~ | ```python ### punct_chars defaults @@ -112,21 +113,6 @@ applied to the `Doc` in order. | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ | | **YIELDS** | The processed documents in order. ~~Doc~~ | -## Sentencizer.score {#score tag="method" new="3"} - -Score a batch of examples. - -> #### Example -> -> ```python -> scores = sentencizer.score(examples) -> ``` - -| Name | Description | -| ----------- | --------------------------------------------------------------------------------------------------------------------- | -| `examples` | The examples to score. ~~Iterable[Example]~~ | -| **RETURNS** | The scores, produced by [`Scorer.score_spans`](/api/scorer#score_spans). 
~~Dict[str, Union[float, Dict[str, float]]~~ | - ## Sentencizer.to_disk {#to_disk tag="method"} Save the sentencizer settings (punctuation characters) to a directory. Will diff --git a/website/docs/api/spancategorizer.md b/website/docs/api/spancategorizer.md index 57395846d..8748b23a2 100644 --- a/website/docs/api/spancategorizer.md +++ b/website/docs/api/spancategorizer.md @@ -43,6 +43,7 @@ architectures and their arguments and hyperparameters. | `spans_key` | Key of the [`Doc.spans`](/api/doc#spans) dict to save the spans under. During initialization and training, the component will look for spans on the reference document under the same key. Defaults to `"spans"`. ~~str~~ | | `threshold` | Minimum probability to consider a prediction positive. Spans with a positive prediction will be saved on the Doc. Defaults to `0.5`. ~~float~~ | | `max_positive` | Maximum number of labels to consider positive per span. Defaults to `None`, indicating no limit. ~~Optional[int]~~ | +| `scorer` | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for `Doc.spans[spans_key]` with overlapping spans allowed. ~~Optional[Callable]~~ | ```python %%GITHUB_SPACY/spacy/pipeline/spancat.py @@ -241,22 +242,6 @@ predicted scores. | `scores` | Scores representing the model's predictions. | | **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ | -## SpanCategorizer.score {#score tag="method"} - -Score a batch of examples. - -> #### Example -> -> ```python -> scores = spancat.score(examples) -> ``` - -| Name | Description | -| -------------- | ---------------------------------------------------------------------------------------------------------------------- | -| `examples` | The examples to score. ~~Iterable[Example]~~ | -| _keyword-only_ | | -| **RETURNS** | The scores, produced by [`Scorer.score_spans`](/api/scorer#score_spans). ~~Dict[str, Union[float, Dict[str, float]]]~~ | - ## SpanCategorizer.create_optimizer {#create_optimizer tag="method"} Create an optimizer for the pipeline component. diff --git a/website/docs/api/tagger.md b/website/docs/api/tagger.md index 3002aff7b..c37483ca4 100644 --- a/website/docs/api/tagger.md +++ b/website/docs/api/tagger.md @@ -54,11 +54,13 @@ Create a new pipeline instance. In your application, you would normally use a shortcut for this and instantiate the component using its string name and [`nlp.add_pipe`](/api/language#add_pipe). -| Name | Description | -| ------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `vocab` | The shared vocabulary. ~~Vocab~~ | -| `model` | A model instance that predicts the tag probabilities. The output vectors should match the number of tags in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). ~~Model[List[Doc], List[Floats2d]]~~ | -| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | +| Name | Description | +| -------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | The shared vocabulary. 
~~Vocab~~ | +| `model` | A model instance that predicts the tag probabilities. The output vectors should match the number of tags in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). ~~Model[List[Doc], List[Floats2d]]~~ | +| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | +| _keyword-only_ | | +| `scorer` | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attribute `"tag"`. ~~Optional[Callable]~~ | ## Tagger.\_\_call\_\_ {#call tag="method"} @@ -249,21 +251,6 @@ predicted scores. | `scores` | Scores representing the model's predictions. | | **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ | -## Tagger.score {#score tag="method" new="3"} - -Score a batch of examples. - -> #### Example -> -> ```python -> scores = tagger.score(examples) -> ``` - -| Name | Description | -| ----------- | --------------------------------------------------------------------------------------------------------------------------------- | -| `examples` | The examples to score. ~~Iterable[Example]~~ | -| **RETURNS** | The scores, produced by [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attribute `"tag"`. ~~Dict[str, float]~~ | - ## Tagger.create_optimizer {#create_optimizer tag="method"} Create an optimizer for the pipeline component. diff --git a/website/docs/api/textcategorizer.md b/website/docs/api/textcategorizer.md index 923da0048..4b1348fa4 100644 --- a/website/docs/api/textcategorizer.md +++ b/website/docs/api/textcategorizer.md @@ -96,13 +96,14 @@ Create a new pipeline instance. In your application, you would normally use a shortcut for this and instantiate the component using its string name and [`nlp.add_pipe`](/api/language#create_pipe). -| Name | Description | -| -------------- | -------------------------------------------------------------------------------------------------------------------------- | -| `vocab` | The shared vocabulary. ~~Vocab~~ | -| `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ | -| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | -| _keyword-only_ | | -| `threshold` | Cutoff to consider a prediction "positive", relevant when printing accuracy results. ~~float~~ | +| Name | Description | +| -------------- | -------------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | The shared vocabulary. ~~Vocab~~ | +| `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ | +| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | +| _keyword-only_ | | +| `threshold` | Cutoff to consider a prediction "positive", relevant when printing accuracy results. ~~float~~ | +| `scorer` | The scoring method. Defaults to [`Scorer.score_cats`](/api/scorer#score_cats) for the attribute `"cats"`. ~~Optional[Callable]~~ | ## TextCategorizer.\_\_call\_\_ {#call tag="method"} diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index 8190d9f78..be45add72 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -373,6 +373,7 @@ factories. 
 | `optimizers` | Registry for functions that create [optimizers](https://thinc.ai/docs/api-optimizers). |
 | `readers` | Registry for file and data readers, including training and evaluation data readers like [`Corpus`](/api/corpus). |
 | `schedules` | Registry for functions that create [schedules](https://thinc.ai/docs/api-schedules). |
+| `scorers` | Registry for functions that create scoring methods for use with the [`Scorer`](/api/scorer). Scoring methods are called with `Iterable[Example]` and arbitrary `\*\*kwargs` and return scores as `Dict[str, Any]`. |
 | `tokenizers` | Registry for tokenizer factories. Registered functions should return a callback that receives the `nlp` object and returns a [`Tokenizer`](/api/tokenizer) or a custom callable. |

 ### spacy-transformers registry {#registry-transformers}

From c5de9b463a30dbb1cd016d4919e4348e55416d5c Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Thu, 19 Aug 2021 14:37:47 +0200
Subject: [PATCH 007/133] Update custom tokenizer APIs and pickling (#8972)

* Fix incorrect pickling of Japanese and Korean pipelines, which led to
  the entire pipeline being reset if pickled

* Enable pickling of Vietnamese tokenizer

* Update tokenizer APIs for Chinese, Japanese, Korean, Thai, and
  Vietnamese so that only the `Vocab` is required for initialization
---
 spacy/lang/ja/__init__.py             | 17 +++++++----------
 spacy/lang/ko/__init__.py             | 19 ++++++++-----------
 spacy/lang/th/__init__.py             |  7 ++++---
 spacy/lang/vi/__init__.py             | 10 +++++++---
 spacy/lang/zh/__init__.py             |  7 ++++---
 spacy/tests/lang/ja/test_serialize.py |  8 ++++++++
 spacy/tests/lang/ko/test_serialize.py | 24 ++++++++++++++++++++++++
 spacy/tests/lang/th/test_serialize.py | 24 ++++++++++++++++++++++++
 spacy/tests/lang/vi/test_serialize.py |  8 ++++++++
 9 files changed, 94 insertions(+), 30 deletions(-)
 create mode 100644 spacy/tests/lang/ko/test_serialize.py
 create mode 100644 spacy/tests/lang/th/test_serialize.py

diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py
index 4e6bf9d3c..12e65413a 100644
--- a/spacy/lang/ja/__init__.py
+++ b/spacy/lang/ja/__init__.py
@@ -8,7 +8,6 @@ from .syntax_iterators import SYNTAX_ITERATORS
 from .tag_map import TAG_MAP
 from .tag_orth_map import TAG_ORTH_MAP
 from .tag_bigram_map import TAG_BIGRAM_MAP
-from ...compat import copy_reg
 from ...errors import Errors
 from ...language import Language
 from ...scorer import Scorer
@@ -16,6 +15,7 @@ from ...symbols import POS
 from ...tokens import Doc
 from ...training import validate_examples
 from ...util import DummyTokenizer, registry, load_config_from_str
+from ...vocab import Vocab
 from ... 
import util @@ -31,17 +31,20 @@ split_mode = null @registry.tokenizers("spacy.ja.JapaneseTokenizer") def create_tokenizer(split_mode: Optional[str] = None): def japanese_tokenizer_factory(nlp): - return JapaneseTokenizer(nlp, split_mode=split_mode) + return JapaneseTokenizer(nlp.vocab, split_mode=split_mode) return japanese_tokenizer_factory class JapaneseTokenizer(DummyTokenizer): - def __init__(self, nlp: Language, split_mode: Optional[str] = None) -> None: - self.vocab = nlp.vocab + def __init__(self, vocab: Vocab, split_mode: Optional[str] = None) -> None: + self.vocab = vocab self.split_mode = split_mode self.tokenizer = try_sudachi_import(self.split_mode) + def __reduce__(self): + return JapaneseTokenizer, (self.vocab, self.split_mode) + def __call__(self, text: str) -> Doc: # convert sudachipy.morpheme.Morpheme to DetailedToken and merge continuous spaces sudachipy_tokens = self.tokenizer.tokenize(text) @@ -293,10 +296,4 @@ def get_dtokens_and_spaces(dtokens, text, gap_tag="空白"): return text_dtokens, text_spaces -def pickle_japanese(instance): - return Japanese, tuple() - - -copy_reg.pickle(Japanese, pickle_japanese) - __all__ = ["Japanese"] diff --git a/spacy/lang/ko/__init__.py b/spacy/lang/ko/__init__.py index 83c9f4962..daa445e09 100644 --- a/spacy/lang/ko/__init__.py +++ b/spacy/lang/ko/__init__.py @@ -1,15 +1,15 @@ -from typing import Optional, Any, Dict +from typing import Any, Dict from .stop_words import STOP_WORDS from .tag_map import TAG_MAP from .lex_attrs import LEX_ATTRS from ...language import Language from ...tokens import Doc -from ...compat import copy_reg from ...scorer import Scorer from ...symbols import POS from ...training import validate_examples from ...util import DummyTokenizer, registry, load_config_from_str +from ...vocab import Vocab DEFAULT_CONFIG = """ @@ -23,17 +23,20 @@ DEFAULT_CONFIG = """ @registry.tokenizers("spacy.ko.KoreanTokenizer") def create_tokenizer(): def korean_tokenizer_factory(nlp): - return KoreanTokenizer(nlp) + return KoreanTokenizer(nlp.vocab) return korean_tokenizer_factory class KoreanTokenizer(DummyTokenizer): - def __init__(self, nlp: Optional[Language] = None): - self.vocab = nlp.vocab + def __init__(self, vocab: Vocab): + self.vocab = vocab MeCab = try_mecab_import() self.mecab_tokenizer = MeCab("-F%f[0],%f[7]") + def __reduce__(self): + return KoreanTokenizer, (self.vocab,) + def __del__(self): self.mecab_tokenizer.__del__() @@ -106,10 +109,4 @@ def check_spaces(text, tokens): yield False -def pickle_korean(instance): - return Korean, tuple() - - -copy_reg.pickle(Korean, pickle_korean) - __all__ = ["Korean"] diff --git a/spacy/lang/th/__init__.py b/spacy/lang/th/__init__.py index 219c50c1a..a89d4dc77 100644 --- a/spacy/lang/th/__init__.py +++ b/spacy/lang/th/__init__.py @@ -3,6 +3,7 @@ from .lex_attrs import LEX_ATTRS from ...language import Language from ...tokens import Doc from ...util import DummyTokenizer, registry, load_config_from_str +from ...vocab import Vocab DEFAULT_CONFIG = """ @@ -16,13 +17,13 @@ DEFAULT_CONFIG = """ @registry.tokenizers("spacy.th.ThaiTokenizer") def create_thai_tokenizer(): def thai_tokenizer_factory(nlp): - return ThaiTokenizer(nlp) + return ThaiTokenizer(nlp.vocab) return thai_tokenizer_factory class ThaiTokenizer(DummyTokenizer): - def __init__(self, nlp: Language) -> None: + def __init__(self, vocab: Vocab) -> None: try: from pythainlp.tokenize import word_tokenize except ImportError: @@ -31,7 +32,7 @@ class ThaiTokenizer(DummyTokenizer): "https://github.com/PyThaiNLP/pythainlp" ) from None 
self.word_tokenize = word_tokenize - self.vocab = nlp.vocab + self.vocab = vocab def __call__(self, text: str) -> Doc: words = list(self.word_tokenize(text)) diff --git a/spacy/lang/vi/__init__.py b/spacy/lang/vi/__init__.py index b6d873a13..afc715ff3 100644 --- a/spacy/lang/vi/__init__.py +++ b/spacy/lang/vi/__init__.py @@ -9,6 +9,7 @@ from .lex_attrs import LEX_ATTRS from ...language import Language from ...tokens import Doc from ...util import DummyTokenizer, registry, load_config_from_str +from ...vocab import Vocab from ... import util @@ -24,14 +25,14 @@ use_pyvi = true @registry.tokenizers("spacy.vi.VietnameseTokenizer") def create_vietnamese_tokenizer(use_pyvi: bool = True): def vietnamese_tokenizer_factory(nlp): - return VietnameseTokenizer(nlp, use_pyvi=use_pyvi) + return VietnameseTokenizer(nlp.vocab, use_pyvi=use_pyvi) return vietnamese_tokenizer_factory class VietnameseTokenizer(DummyTokenizer): - def __init__(self, nlp: Language, use_pyvi: bool = False): - self.vocab = nlp.vocab + def __init__(self, vocab: Vocab, use_pyvi: bool = False): + self.vocab = vocab self.use_pyvi = use_pyvi if self.use_pyvi: try: @@ -45,6 +46,9 @@ class VietnameseTokenizer(DummyTokenizer): ) raise ImportError(msg) from None + def __reduce__(self): + return VietnameseTokenizer, (self.vocab, self.use_pyvi) + def __call__(self, text: str) -> Doc: if self.use_pyvi: words = self.pyvi_tokenize(text) diff --git a/spacy/lang/zh/__init__.py b/spacy/lang/zh/__init__.py index 9a8a21a63..c6dd7bb85 100644 --- a/spacy/lang/zh/__init__.py +++ b/spacy/lang/zh/__init__.py @@ -11,6 +11,7 @@ from ...scorer import Scorer from ...tokens import Doc from ...training import validate_examples, Example from ...util import DummyTokenizer, registry, load_config_from_str +from ...vocab import Vocab from .lex_attrs import LEX_ATTRS from .stop_words import STOP_WORDS from ... 
import util @@ -48,14 +49,14 @@ class Segmenter(str, Enum): @registry.tokenizers("spacy.zh.ChineseTokenizer") def create_chinese_tokenizer(segmenter: Segmenter = Segmenter.char): def chinese_tokenizer_factory(nlp): - return ChineseTokenizer(nlp, segmenter=segmenter) + return ChineseTokenizer(nlp.vocab, segmenter=segmenter) return chinese_tokenizer_factory class ChineseTokenizer(DummyTokenizer): - def __init__(self, nlp: Language, segmenter: Segmenter = Segmenter.char): - self.vocab = nlp.vocab + def __init__(self, vocab: Vocab, segmenter: Segmenter = Segmenter.char): + self.vocab = vocab if isinstance(segmenter, Segmenter): segmenter = segmenter.value self.segmenter = segmenter diff --git a/spacy/tests/lang/ja/test_serialize.py b/spacy/tests/lang/ja/test_serialize.py index e05a363bf..011eb470f 100644 --- a/spacy/tests/lang/ja/test_serialize.py +++ b/spacy/tests/lang/ja/test_serialize.py @@ -1,3 +1,5 @@ +import pickle + from spacy.lang.ja import Japanese from ...util import make_tempdir @@ -31,3 +33,9 @@ def test_ja_tokenizer_serialize(ja_tokenizer): nlp_r.from_disk(d) assert nlp_bytes == nlp_r.to_bytes() assert nlp_r.tokenizer.split_mode == "B" + + +def test_ja_tokenizer_pickle(ja_tokenizer): + b = pickle.dumps(ja_tokenizer) + ja_tokenizer_re = pickle.loads(b) + assert ja_tokenizer.to_bytes() == ja_tokenizer_re.to_bytes() diff --git a/spacy/tests/lang/ko/test_serialize.py b/spacy/tests/lang/ko/test_serialize.py new file mode 100644 index 000000000..75288fcc5 --- /dev/null +++ b/spacy/tests/lang/ko/test_serialize.py @@ -0,0 +1,24 @@ +import pickle + +from spacy.lang.ko import Korean +from ...util import make_tempdir + + +def test_ko_tokenizer_serialize(ko_tokenizer): + tokenizer_bytes = ko_tokenizer.to_bytes() + nlp = Korean() + nlp.tokenizer.from_bytes(tokenizer_bytes) + assert tokenizer_bytes == nlp.tokenizer.to_bytes() + + with make_tempdir() as d: + file_path = d / "tokenizer" + ko_tokenizer.to_disk(file_path) + nlp = Korean() + nlp.tokenizer.from_disk(file_path) + assert tokenizer_bytes == nlp.tokenizer.to_bytes() + + +def test_ko_tokenizer_pickle(ko_tokenizer): + b = pickle.dumps(ko_tokenizer) + ko_tokenizer_re = pickle.loads(b) + assert ko_tokenizer.to_bytes() == ko_tokenizer_re.to_bytes() diff --git a/spacy/tests/lang/th/test_serialize.py b/spacy/tests/lang/th/test_serialize.py new file mode 100644 index 000000000..a3de4bf54 --- /dev/null +++ b/spacy/tests/lang/th/test_serialize.py @@ -0,0 +1,24 @@ +import pickle + +from spacy.lang.th import Thai +from ...util import make_tempdir + + +def test_th_tokenizer_serialize(th_tokenizer): + tokenizer_bytes = th_tokenizer.to_bytes() + nlp = Thai() + nlp.tokenizer.from_bytes(tokenizer_bytes) + assert tokenizer_bytes == nlp.tokenizer.to_bytes() + + with make_tempdir() as d: + file_path = d / "tokenizer" + th_tokenizer.to_disk(file_path) + nlp = Thai() + nlp.tokenizer.from_disk(file_path) + assert tokenizer_bytes == nlp.tokenizer.to_bytes() + + +def test_th_tokenizer_pickle(th_tokenizer): + b = pickle.dumps(th_tokenizer) + th_tokenizer_re = pickle.loads(b) + assert th_tokenizer.to_bytes() == th_tokenizer_re.to_bytes() diff --git a/spacy/tests/lang/vi/test_serialize.py b/spacy/tests/lang/vi/test_serialize.py index ed4652df7..55dab799c 100644 --- a/spacy/tests/lang/vi/test_serialize.py +++ b/spacy/tests/lang/vi/test_serialize.py @@ -1,3 +1,5 @@ +import pickle + from spacy.lang.vi import Vietnamese from ...util import make_tempdir @@ -31,3 +33,9 @@ def test_vi_tokenizer_serialize(vi_tokenizer): nlp_r.from_disk(d) assert nlp_bytes == 
nlp_r.to_bytes() assert nlp_r.tokenizer.use_pyvi is False + + +def test_vi_tokenizer_pickle(vi_tokenizer): + b = pickle.dumps(vi_tokenizer) + vi_tokenizer_re = pickle.loads(b) + assert vi_tokenizer.to_bytes() == vi_tokenizer_re.to_bytes() From 0f01f46e028a0709424c28e68a882f6a5246bcb5 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Tue, 14 Sep 2021 00:02:17 +0900 Subject: [PATCH 008/133] Update Cython string types (#9143) * Replace all basestring references with unicode `basestring` was a compatability type introduced by Cython to make dealing with utf-8 strings in Python2 easier. In Python3 it is equivalent to the unicode (or str) type. I replaced all references to basestring with unicode, since that was used elsewhere, but we could also just replace them with str, which shoudl also be equivalent. All tests pass locally. * Replace all references to unicode type with str Since we only support python3 this is simpler. * Remove all references to unicode type This removes all references to the unicode type across the codebase and replaces them with `str`, which makes it more drastic than the prior commits. In order to make this work importing `unicode_literals` had to be removed, and one explicit unicode literal also had to be removed (it is unclear why this is necessary in Cython with language level 3, but without doing it there were errors about implicit conversion). When `unicode` is used as a type in comments it was also edited to be `str`. Additionally `coding: utf8` headers were removed from a few files. --- spacy/attrs.pyx | 2 +- spacy/kb.pyx | 20 ++++++------ spacy/lang/en/lemmatizer.py | 2 +- spacy/lexeme.pyx | 12 +++---- spacy/matcher/dependencymatcher.pyx | 8 ++--- spacy/matcher/matcher.pyx | 14 ++++---- .../pipeline/_parser_internals/arc_eager.pyx | 2 +- spacy/strings.pxd | 6 ++-- spacy/strings.pyx | 20 ++++++------ spacy/tests/lang/ky/test_tokenizer.py | 3 -- spacy/tokenizer.pxd | 8 ++--- spacy/tokenizer.pyx | 32 +++++++++---------- spacy/tokens/_serialize.py | 2 +- spacy/tokens/doc.pyx | 6 ++-- spacy/tokens/span.pyx | 8 ++--- spacy/tokens/token.pyx | 8 ++--- spacy/vocab.pxd | 6 ++-- spacy/vocab.pyx | 28 ++++++++-------- 18 files changed, 90 insertions(+), 97 deletions(-) diff --git a/spacy/attrs.pyx b/spacy/attrs.pyx index 9122de17b..640fb2f3c 100644 --- a/spacy/attrs.pyx +++ b/spacy/attrs.pyx @@ -142,7 +142,7 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False): for name, value in stringy_attrs.items(): int_key = intify_attr(name) if int_key is not None: - if strings_map is not None and isinstance(value, basestring): + if strings_map is not None and isinstance(value, str): if hasattr(strings_map, 'add'): value = strings_map.add(value) else: diff --git a/spacy/kb.pyx b/spacy/kb.pyx index d8514b54c..421a8241a 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -122,7 +122,7 @@ cdef class KnowledgeBase: def get_alias_strings(self): return [self.vocab.strings[x] for x in self._alias_index] - def add_entity(self, unicode entity, float freq, vector[float] entity_vector): + def add_entity(self, str entity, float freq, vector[float] entity_vector): """ Add an entity to the KB, optionally specifying its log probability based on corpus frequency Return the hash of the entity ID/name at the end. 
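The two recurring patterns in these commits — replacing the old `basestring`/`unicode` checks with plain `str`, and making tokenizers picklable by returning their constructor arguments from `__reduce__` — boil down to a few lines of plain Python. A minimal sketch, assuming a hypothetical `to_utf8_bytes` helper and `ToyTokenizer` class that are illustrations only and not part of spaCy:

```python
import pickle


def to_utf8_bytes(content) -> bytes:
    # Python 3 form of the old `basestring` check: `str` covers all text,
    # and bytes input passes through unchanged.
    return content.encode("utf8") if isinstance(content, str) else content


class ToyTokenizer:
    # Stand-in for the custom tokenizers above: the constructor arguments are
    # enough to rebuild the object, so __reduce__ returns them instead of
    # trying to pickle internal state (e.g. a wrapped C tokenizer).
    def __init__(self, vocab, split_mode=None):
        self.vocab = vocab
        self.split_mode = split_mode

    def __reduce__(self):
        return ToyTokenizer, (self.vocab, self.split_mode)


assert to_utf8_bytes("löwe") == b"l\xc3\xb6we"
assert to_utf8_bytes(b"abc") == b"abc"

tok = pickle.loads(pickle.dumps(ToyTokenizer({"toy": "vocab"}, split_mode="B")))
assert tok.split_mode == "B"
```

In the tokenizer diffs above, the removed `copy_reg` hooks rebuilt the whole `Japanese`/`Korean` language class with no arguments, which is why pickling reset the pipeline; the added per-tokenizer `__reduce__` methods preserve the `Vocab` and settings instead, as the new serialization tests exercise.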
@@ -182,15 +182,15 @@ cdef class KnowledgeBase: i += 1 - def contains_entity(self, unicode entity): + def contains_entity(self, str entity): cdef hash_t entity_hash = self.vocab.strings.add(entity) return entity_hash in self._entry_index - def contains_alias(self, unicode alias): + def contains_alias(self, str alias): cdef hash_t alias_hash = self.vocab.strings.add(alias) return alias_hash in self._alias_index - def add_alias(self, unicode alias, entities, probabilities): + def add_alias(self, str alias, entities, probabilities): """ For a given alias, add its potential entities and prior probabilies to the KB. Return the alias_hash at the end @@ -236,7 +236,7 @@ cdef class KnowledgeBase: raise RuntimeError(Errors.E891.format(alias=alias)) return alias_hash - def append_alias(self, unicode alias, unicode entity, float prior_prob, ignore_warnings=False): + def append_alias(self, str alias, str entity, float prior_prob, ignore_warnings=False): """ For an alias already existing in the KB, extend its potential entities with one more. Throw a warning if either the alias or the entity is unknown, @@ -283,7 +283,7 @@ cdef class KnowledgeBase: alias_entry.probs = probs self._aliases_table[alias_index] = alias_entry - def get_alias_candidates(self, unicode alias) -> Iterator[Candidate]: + def get_alias_candidates(self, str alias) -> Iterator[Candidate]: """ Return candidate entities for an alias. Each candidate defines the entity, the original alias, and the prior probability of that alias resolving to that entity. @@ -304,7 +304,7 @@ cdef class KnowledgeBase: for (entry_index, prior_prob) in zip(alias_entry.entry_indices, alias_entry.probs) if entry_index != 0] - def get_vector(self, unicode entity): + def get_vector(self, str entity): cdef hash_t entity_hash = self.vocab.strings[entity] # Return an empty list if this entity is unknown in this KB @@ -314,7 +314,7 @@ cdef class KnowledgeBase: return self._vectors_table[self._entries[entry_index].vector_index] - def get_prior_prob(self, unicode entity, unicode alias): + def get_prior_prob(self, str entity, str alias): """ Return the prior probability of a given alias being linked to a given entity, or return 0.0 when this combination is not known in the knowledge base""" cdef hash_t alias_hash = self.vocab.strings[alias] @@ -582,7 +582,7 @@ cdef class Writer: def __init__(self, path): assert isinstance(path, Path) content = bytes(path) - cdef bytes bytes_loc = content.encode('utf8') if type(content) == unicode else content + cdef bytes bytes_loc = content.encode('utf8') if type(content) == str else content self._fp = fopen(bytes_loc, 'wb') if not self._fp: raise IOError(Errors.E146.format(path=path)) @@ -624,7 +624,7 @@ cdef class Writer: cdef class Reader: def __init__(self, path): content = bytes(path) - cdef bytes bytes_loc = content.encode('utf8') if type(content) == unicode else content + cdef bytes bytes_loc = content.encode('utf8') if type(content) == str else content self._fp = fopen(bytes_loc, 'rb') if not self._fp: PyErr_SetFromErrno(IOError) diff --git a/spacy/lang/en/lemmatizer.py b/spacy/lang/en/lemmatizer.py index 2cb0f9a53..c88b69bcc 100644 --- a/spacy/lang/en/lemmatizer.py +++ b/spacy/lang/en/lemmatizer.py @@ -10,7 +10,7 @@ class EnglishLemmatizer(Lemmatizer): Check whether we're dealing with an uninflected paradigm, so we can avoid lemmatization entirely. - univ_pos (unicode / int): The token's universal part-of-speech tag. + univ_pos (str / int): The token's universal part-of-speech tag. 
morphology (dict): The token's morphological features following the Universal Dependencies scheme. """ diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index 3564b6e42..792e405dd 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -284,7 +284,7 @@ cdef class Lexeme: def __get__(self): return self.vocab.strings[self.c.lower] - def __set__(self, unicode x): + def __set__(self, str x): self.c.lower = self.vocab.strings.add(x) property norm_: @@ -294,7 +294,7 @@ cdef class Lexeme: def __get__(self): return self.vocab.strings[self.c.norm] - def __set__(self, unicode x): + def __set__(self, str x): self.norm = self.vocab.strings.add(x) property shape_: @@ -304,7 +304,7 @@ cdef class Lexeme: def __get__(self): return self.vocab.strings[self.c.shape] - def __set__(self, unicode x): + def __set__(self, str x): self.c.shape = self.vocab.strings.add(x) property prefix_: @@ -314,7 +314,7 @@ cdef class Lexeme: def __get__(self): return self.vocab.strings[self.c.prefix] - def __set__(self, unicode x): + def __set__(self, str x): self.c.prefix = self.vocab.strings.add(x) property suffix_: @@ -324,7 +324,7 @@ cdef class Lexeme: def __get__(self): return self.vocab.strings[self.c.suffix] - def __set__(self, unicode x): + def __set__(self, str x): self.c.suffix = self.vocab.strings.add(x) property lang_: @@ -332,7 +332,7 @@ cdef class Lexeme: def __get__(self): return self.vocab.strings[self.c.lang] - def __set__(self, unicode x): + def __set__(self, str x): self.c.lang = self.vocab.strings.add(x) property flags: diff --git a/spacy/matcher/dependencymatcher.pyx b/spacy/matcher/dependencymatcher.pyx index 9e0842d59..9593634d7 100644 --- a/spacy/matcher/dependencymatcher.pyx +++ b/spacy/matcher/dependencymatcher.pyx @@ -151,9 +151,9 @@ cdef class DependencyMatcher: Creates a token key to be used by the matcher """ return self._normalize_key( - unicode(key) + DELIMITER + - unicode(pattern_idx) + DELIMITER + - unicode(token_idx) + str(key) + DELIMITER + + str(pattern_idx) + DELIMITER + + str(token_idx) ) def add(self, key, patterns, *, on_match=None): @@ -438,7 +438,7 @@ cdef class DependencyMatcher: return candidate_children def _normalize_key(self, key): - if isinstance(key, basestring): + if isinstance(key, str): return self.vocab.strings.add(key) else: return key diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index 555766f62..6a23d1f4b 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -317,7 +317,7 @@ cdef class Matcher: return final_matches def _normalize_key(self, key): - if isinstance(key, basestring): + if isinstance(key, str): return self.vocab.strings.add(key) else: return key @@ -365,7 +365,7 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e for i, token in enumerate(doclike): for name, index in extensions.items(): value = token._.get(name) - if isinstance(value, basestring): + if isinstance(value, str): value = token.vocab.strings[value] extra_attr_values[i * nr_extra_attr + index] = value # Main loop @@ -791,7 +791,7 @@ def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates): def _get_attr_values(spec, string_store): attr_values = [] for attr, value in spec.items(): - if isinstance(attr, basestring): + if isinstance(attr, str): attr = attr.upper() if attr == '_': continue @@ -802,7 +802,7 @@ def _get_attr_values(spec, string_store): if attr == "IS_SENT_START": attr = "SENT_START" attr = IDS.get(attr) - if isinstance(value, basestring): + if isinstance(value, str): value = 
string_store.add(value) elif isinstance(value, bool): value = int(value) @@ -943,7 +943,7 @@ def _get_extra_predicates(spec, extra_predicates, vocab): seen_predicates = {pred.key: pred.i for pred in extra_predicates} output = [] for attr, value in spec.items(): - if isinstance(attr, basestring): + if isinstance(attr, str): if attr == "_": output.extend( _get_extension_extra_predicates( @@ -1000,7 +1000,7 @@ def _get_operators(spec): "?": (ZERO_ONE,), "1": (ONE,), "!": (ZERO,)} # Fix casing spec = {key.upper(): values for key, values in spec.items() - if isinstance(key, basestring)} + if isinstance(key, str)} if "OP" not in spec: return (ONE,) elif spec["OP"] in lookup: @@ -1018,7 +1018,7 @@ def _get_extensions(spec, string_store, name2index): if isinstance(value, dict): # Handle predicates (e.g. "IN", in the extra_predicates, not here. continue - if isinstance(value, basestring): + if isinstance(value, str): value = string_store.add(value) if name not in name2index: name2index[name] = len(name2index) diff --git a/spacy/pipeline/_parser_internals/arc_eager.pyx b/spacy/pipeline/_parser_internals/arc_eager.pyx index 9ca702f9b..f34975858 100644 --- a/spacy/pipeline/_parser_internals/arc_eager.pyx +++ b/spacy/pipeline/_parser_internals/arc_eager.pyx @@ -17,7 +17,7 @@ from ...errors import Errors from thinc.extra.search cimport Beam cdef weight_t MIN_SCORE = -90000 -cdef attr_t SUBTOK_LABEL = hash_string(u'subtok') +cdef attr_t SUBTOK_LABEL = hash_string('subtok') DEF NON_MONOTONIC = True diff --git a/spacy/strings.pxd b/spacy/strings.pxd index 07768d347..370180135 100644 --- a/spacy/strings.pxd +++ b/spacy/strings.pxd @@ -8,10 +8,10 @@ from murmurhash.mrmr cimport hash64 from .typedefs cimport attr_t, hash_t -cpdef hash_t hash_string(unicode string) except 0 +cpdef hash_t hash_string(str string) except 0 cdef hash_t hash_utf8(char* utf8_string, int length) nogil -cdef unicode decode_Utf8Str(const Utf8Str* string) +cdef str decode_Utf8Str(const Utf8Str* string) ctypedef union Utf8Str: @@ -25,5 +25,5 @@ cdef class StringStore: cdef vector[hash_t] keys cdef public PreshMap _map - cdef const Utf8Str* intern_unicode(self, unicode py_string) + cdef const Utf8Str* intern_unicode(self, str py_string) cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length) diff --git a/spacy/strings.pyx b/spacy/strings.pyx index 4a20cb8af..39fc441e9 100644 --- a/spacy/strings.pyx +++ b/spacy/strings.pyx @@ -33,7 +33,7 @@ def get_string_id(key): return hash_utf8(chars, len(chars)) -cpdef hash_t hash_string(unicode string) except 0: +cpdef hash_t hash_string(str string) except 0: chars = string.encode("utf8") return hash_utf8(chars, len(chars)) @@ -46,7 +46,7 @@ cdef uint32_t hash32_utf8(char* utf8_string, int length) nogil: return hash32(utf8_string, length, 1) -cdef unicode decode_Utf8Str(const Utf8Str* string): +cdef str decode_Utf8Str(const Utf8Str* string): cdef int i, length if string.s[0] < sizeof(string.s) and string.s[0] != 0: return string.s[1:string.s[0]+1].decode("utf8") @@ -107,17 +107,17 @@ cdef class StringStore: def __getitem__(self, object string_or_id): """Retrieve a string from a given hash, or vice versa. - string_or_id (bytes, unicode or uint64): The value to encode. + string_or_id (bytes, str or uint64): The value to encode. Returns (str / uint64): The value to be retrieved. 
""" - if isinstance(string_or_id, basestring) and len(string_or_id) == 0: + if isinstance(string_or_id, str) and len(string_or_id) == 0: return 0 elif string_or_id == 0: return "" elif string_or_id in SYMBOLS_BY_STR: return SYMBOLS_BY_STR[string_or_id] cdef hash_t key - if isinstance(string_or_id, unicode): + if isinstance(string_or_id, str): key = hash_string(string_or_id) return key elif isinstance(string_or_id, bytes): @@ -135,14 +135,14 @@ cdef class StringStore: def as_int(self, key): """If key is an int, return it; otherwise, get the int value.""" - if not isinstance(key, basestring): + if not isinstance(key, str): return key else: return self[key] def as_string(self, key): """If key is a string, return it; otherwise, get the string value.""" - if isinstance(key, basestring): + if isinstance(key, str): return key else: return self[key] @@ -153,7 +153,7 @@ cdef class StringStore: string (str): The string to add. RETURNS (uint64): The string's hash value. """ - if isinstance(string, unicode): + if isinstance(string, str): if string in SYMBOLS_BY_STR: return SYMBOLS_BY_STR[string] key = hash_string(string) @@ -189,7 +189,7 @@ cdef class StringStore: return True elif string in SYMBOLS_BY_STR: return True - elif isinstance(string, unicode): + elif isinstance(string, str): key = hash_string(string) else: string = string.encode("utf8") @@ -269,7 +269,7 @@ cdef class StringStore: for string in strings: self.add(string) - cdef const Utf8Str* intern_unicode(self, unicode py_string): + cdef const Utf8Str* intern_unicode(self, str py_string): # 0 means missing, but we don't bother offsetting the index. cdef bytes byte_string = py_string.encode("utf8") return self._intern_utf8(byte_string, len(byte_string)) diff --git a/spacy/tests/lang/ky/test_tokenizer.py b/spacy/tests/lang/ky/test_tokenizer.py index 91a048764..5cf6eb1a6 100644 --- a/spacy/tests/lang/ky/test_tokenizer.py +++ b/spacy/tests/lang/ky/test_tokenizer.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tokenizer.pxd b/spacy/tokenizer.pxd index 719e8e6f5..44f6ee522 100644 --- a/spacy/tokenizer.pxd +++ b/spacy/tokenizer.pxd @@ -26,7 +26,7 @@ cdef class Tokenizer: cdef int _property_init_count # TODO: unused, remove in v3.1 cdef int _property_init_max # TODO: unused, remove in v3.1 - cdef Doc _tokenize_affixes(self, unicode string, bint with_special_cases) + cdef Doc _tokenize_affixes(self, str string, bint with_special_cases) cdef int _apply_special_cases(self, Doc doc) except -1 cdef void _filter_special_spans(self, vector[SpanC] &original, vector[SpanC] &filtered, int doc_len) nogil @@ -37,13 +37,13 @@ cdef class Tokenizer: cdef int _try_specials_and_cache(self, hash_t key, Doc tokens, int* has_special, bint with_special_cases) except -1 - cdef int _tokenize(self, Doc tokens, unicode span, hash_t key, + cdef int _tokenize(self, Doc tokens, str span, hash_t key, int* has_special, bint with_special_cases) except -1 - cdef unicode _split_affixes(self, Pool mem, unicode string, + cdef str _split_affixes(self, Pool mem, str string, vector[LexemeC*] *prefixes, vector[LexemeC*] *suffixes, int* has_special, bint with_special_cases) - cdef int _attach_tokens(self, Doc tokens, unicode string, + cdef int _attach_tokens(self, Doc tokens, str string, vector[LexemeC*] *prefixes, vector[LexemeC*] *suffixes, int* has_special, bint with_special_cases) except -1 diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 5a89e5a17..c0c8520c7 100644 --- a/spacy/tokenizer.pyx +++ 
b/spacy/tokenizer.pyx @@ -1,6 +1,4 @@ # cython: embedsignature=True, profile=True, binding=True -from __future__ import unicode_literals - from cython.operator cimport dereference as deref from cython.operator cimport preincrement as preinc from libc.string cimport memcpy, memset @@ -132,7 +130,7 @@ cdef class Tokenizer: self.url_match) return (self.__class__, args, None, None) - def __call__(self, unicode string): + def __call__(self, str string): """Tokenize a string. string (str): The string to tokenize. @@ -145,7 +143,7 @@ cdef class Tokenizer: return doc @cython.boundscheck(False) - cdef Doc _tokenize_affixes(self, unicode string, bint with_special_cases): + cdef Doc _tokenize_affixes(self, str string, bint with_special_cases): """Tokenize according to affix and token_match settings. string (str): The string to tokenize. @@ -161,7 +159,7 @@ cdef class Tokenizer: cdef int start = 0 cdef int has_special = 0 cdef bint in_ws = string[0].isspace() - cdef unicode span + cdef str span # The task here is much like string.split, but not quite # We find spans of whitespace and non-space characters, and ignore # spans that are exactly ' '. So, our sequences will all be separated @@ -373,7 +371,7 @@ cdef class Tokenizer: return False return True - cdef int _tokenize(self, Doc tokens, unicode span, hash_t orig_key, int* has_special, bint with_special_cases) except -1: + cdef int _tokenize(self, Doc tokens, str span, hash_t orig_key, int* has_special, bint with_special_cases) except -1: cdef vector[LexemeC*] prefixes cdef vector[LexemeC*] suffixes cdef int orig_size @@ -385,16 +383,16 @@ cdef class Tokenizer: self._save_cached(&tokens.c[orig_size], orig_key, has_special, tokens.length - orig_size) - cdef unicode _split_affixes(self, Pool mem, unicode string, + cdef str _split_affixes(self, Pool mem, str string, vector[const LexemeC*] *prefixes, vector[const LexemeC*] *suffixes, int* has_special, bint with_special_cases): cdef size_t i - cdef unicode prefix - cdef unicode suffix - cdef unicode minus_pre - cdef unicode minus_suf + cdef str prefix + cdef str suffix + cdef str minus_pre + cdef str minus_suf cdef size_t last_size = 0 while string and len(string) != last_size: if self.token_match and self.token_match(string): @@ -430,7 +428,7 @@ cdef class Tokenizer: suffixes.push_back(self.vocab.get(mem, suffix)) return string - cdef int _attach_tokens(self, Doc tokens, unicode string, + cdef int _attach_tokens(self, Doc tokens, str string, vector[const LexemeC*] *prefixes, vector[const LexemeC*] *suffixes, int* has_special, @@ -440,7 +438,7 @@ cdef class Tokenizer: cdef int split, end cdef const LexemeC* const* lexemes cdef const LexemeC* lexeme - cdef unicode span + cdef str span cdef int i if prefixes.size(): for i in range(prefixes.size()): @@ -513,7 +511,7 @@ cdef class Tokenizer: cached.data.lexemes = lexemes self._cache.set(key, cached) - def find_infix(self, unicode string): + def find_infix(self, str string): """Find internal split points of the string, such as hyphens. string (str): The string to segment. @@ -527,7 +525,7 @@ cdef class Tokenizer: return 0 return list(self.infix_finditer(string)) - def find_prefix(self, unicode string): + def find_prefix(self, str string): """Find the length of a prefix that should be segmented from the string, or None if no prefix rules match. 
@@ -541,7 +539,7 @@ cdef class Tokenizer: match = self.prefix_search(string) return (match.end() - match.start()) if match is not None else 0 - def find_suffix(self, unicode string): + def find_suffix(self, str string): """Find the length of a suffix that should be segmented from the string, or None if no suffix rules match. @@ -579,7 +577,7 @@ cdef class Tokenizer: if attr not in (ORTH, NORM): raise ValueError(Errors.E1005.format(attr=self.vocab.strings[attr], chunk=chunk)) - def add_special_case(self, unicode string, substrings): + def add_special_case(self, str string, substrings): """Add a special-case tokenization rule. string (str): The string to specially tokenize. diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py index 868eb3eab..2ce329375 100644 --- a/spacy/tokens/_serialize.py +++ b/spacy/tokens/_serialize.py @@ -36,7 +36,7 @@ class DocBin: "spans": List[Dict[str, bytes]], # SpanGroups data for each doc "spaces": bytes, # Serialized numpy boolean array with spaces data "lengths": bytes, # Serialized numpy int32 array with the doc lengths - "strings": List[unicode] # List of unique strings in the token data + "strings": List[str] # List of unique strings in the token data "version": str, # DocBin version number } diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index cd2bd6f6c..c4ddd4163 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -260,7 +260,7 @@ cdef class Doc: raise ValueError(Errors.E027) cdef const LexemeC* lexeme for word, has_space in zip(words, spaces): - if isinstance(word, unicode): + if isinstance(word, str): lexeme = self.vocab.get(self.mem, word) elif isinstance(word, bytes): raise ValueError(Errors.E028.format(value=word)) @@ -1362,7 +1362,7 @@ cdef class Doc: self.has_unknown_spaces = msg["has_unknown_spaces"] start = 0 cdef const LexemeC* lex - cdef unicode orth_ + cdef str orth_ text = msg["text"] attrs = msg["array_body"] for i in range(attrs.shape[0]): @@ -1423,7 +1423,7 @@ cdef class Doc: attributes are inherited from the syntactic root of the span. RETURNS (Token): The first newly merged token. """ - cdef unicode tag, lemma, ent_type + cdef str tag, lemma, ent_type attr_len = len(attributes) span_len = len(spans) if not attr_len == span_len: diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 48c6053c1..5807ff2d2 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - cimport numpy as np from libc.math cimport sqrt @@ -745,7 +743,7 @@ cdef class Span: def __get__(self): return self.root.ent_id_ - def __set__(self, unicode key): + def __set__(self, str key): raise NotImplementedError(Errors.E200.format(attr="ent_id_")) @property @@ -766,7 +764,7 @@ cdef class Span: def __get__(self): return self.doc.vocab.strings[self.label] - def __set__(self, unicode label_): + def __set__(self, str label_): self.label = self.doc.vocab.strings.add(label_) property kb_id_: @@ -774,7 +772,7 @@ cdef class Span: def __get__(self): return self.doc.vocab.strings[self.kb_id] - def __set__(self, unicode kb_id_): + def __set__(self, str kb_id_): self.kb_id = self.doc.vocab.strings.add(kb_id_) diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 3fcfda691..8877cf9d0 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -267,7 +267,7 @@ cdef class Token: """RETURNS (str): The text content of the span (with trailing whitespace). 
""" - cdef unicode orth = self.vocab.strings[self.c.lex.orth] + cdef str orth = self.vocab.strings[self.c.lex.orth] if self.c.spacy: return orth + " " else: @@ -820,7 +820,7 @@ cdef class Token: def __get__(self): return self.vocab.strings[self.norm] - def __set__(self, unicode norm_): + def __set__(self, str norm_): self.c.norm = self.vocab.strings.add(norm_) @property @@ -858,7 +858,7 @@ cdef class Token: def __get__(self): return self.vocab.strings[self.c.lemma] - def __set__(self, unicode lemma_): + def __set__(self, str lemma_): self.c.lemma = self.vocab.strings.add(lemma_) property pos_: @@ -890,7 +890,7 @@ cdef class Token: def __get__(self): return self.vocab.strings[self.c.dep] - def __set__(self, unicode label): + def __set__(self, str label): self.c.dep = self.vocab.strings.add(label) @property diff --git a/spacy/vocab.pxd b/spacy/vocab.pxd index 9067476f7..9b556247b 100644 --- a/spacy/vocab.pxd +++ b/spacy/vocab.pxd @@ -36,12 +36,12 @@ cdef class Vocab: cdef public object lex_attr_getters cdef public object cfg - cdef const LexemeC* get(self, Pool mem, unicode string) except NULL + cdef const LexemeC* get(self, Pool mem, str string) except NULL cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL cdef const TokenC* make_fused_token(self, substrings) except NULL - cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL + cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1 - cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL + cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL cdef PreshMap _by_orth diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 13dd675af..552898a98 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -60,7 +60,7 @@ cdef class Vocab: vice versa. lookups (Lookups): Container for large lookup tables and dictionaries. oov_prob (float): Default OOV probability. - vectors_name (unicode): Optional name to identify the vectors table. + vectors_name (str): Optional name to identify the vectors table. get_noun_chunks (Optional[Callable[[Union[Doc, Span], Iterator[Span]]]]): A function that yields base noun phrases used for Doc.noun_chunks. """ @@ -105,7 +105,7 @@ cdef class Vocab: See also: `Lexeme.set_flag`, `Lexeme.check_flag`, `Token.set_flag`, `Token.check_flag`. - flag_getter (callable): A function `f(unicode) -> bool`, to get the + flag_getter (callable): A function `f(str) -> bool`, to get the flag value. flag_id (int): An integer between 1 and 63 (inclusive), specifying the bit at which the flag will be stored. If -1, the lowest @@ -128,7 +128,7 @@ cdef class Vocab: self.lex_attr_getters[flag_id] = flag_getter return flag_id - cdef const LexemeC* get(self, Pool mem, unicode string) except NULL: + cdef const LexemeC* get(self, Pool mem, str string) except NULL: """Get a pointer to a `LexemeC` from the lexicon, creating a new `Lexeme` if necessary using memory acquired from the given pool. If the pool is the lexicon's own memory, the lexeme is saved in the lexicon. @@ -162,7 +162,7 @@ cdef class Vocab: else: return self._new_lexeme(mem, self.strings[orth]) - cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL: + cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL: # I think this heuristic is bad, and the Vocab should always # own the lexemes. It avoids weird bugs this way, as it's how the thing # was originally supposed to work. 
The best solution to the growing @@ -184,7 +184,7 @@ cdef class Vocab: if self.lex_attr_getters is not None: for attr, func in self.lex_attr_getters.items(): value = func(string) - if isinstance(value, unicode): + if isinstance(value, str): value = self.strings.add(value) if value is not None: Lexeme.set_struct_attr(lex, attr, value) @@ -201,7 +201,7 @@ cdef class Vocab: def __contains__(self, key): """Check whether the string or int key has an entry in the vocabulary. - string (unicode): The ID string. + string (str): The ID string. RETURNS (bool) Whether the string has an entry in the vocabulary. DOCS: https://spacy.io/api/vocab#contains @@ -209,7 +209,7 @@ cdef class Vocab: cdef hash_t int_key if isinstance(key, bytes): int_key = self.strings[key.decode("utf8")] - elif isinstance(key, unicode): + elif isinstance(key, str): int_key = self.strings[key] else: int_key = key @@ -234,7 +234,7 @@ cdef class Vocab: previously unseen unicode string is given, a new lexeme is created and stored. - id_or_string (int or unicode): The integer ID of a word, or its unicode + id_or_string (int or str): The integer ID of a word, or its unicode string. If `int >= Lexicon.size`, `IndexError` is raised. If `id_or_string` is neither an int nor a unicode string, `ValueError` is raised. @@ -247,7 +247,7 @@ cdef class Vocab: DOCS: https://spacy.io/api/vocab#getitem """ cdef attr_t orth - if isinstance(id_or_string, unicode): + if isinstance(id_or_string, str): orth = self.strings.add(id_or_string) else: orth = id_or_string @@ -348,7 +348,7 @@ cdef class Vocab: If `minn` is defined, then the resulting vector uses Fasttext's subword features by average over ngrams of `orth`. - orth (int / unicode): The hash value of a word, or its unicode string. + orth (int / str): The hash value of a word, or its unicode string. minn (int): Minimum n-gram length used for Fasttext's ngram computation. Defaults to the length of `orth`. maxn (int): Maximum n-gram length used for Fasttext's ngram computation. @@ -401,7 +401,7 @@ cdef class Vocab: """Set a vector for a word in the vocabulary. Words can be referenced by string or int ID. - orth (int / unicode): The word. + orth (int / str): The word. vector (numpy.ndarray or cupy.nadarry[ndim=1, dtype='float32']): The vector to set. DOCS: https://spacy.io/api/vocab#set_vector @@ -423,7 +423,7 @@ cdef class Vocab: """Check whether a word has a vector. Returns False if no vectors have been loaded. Words can be looked up by string or int ID. - orth (int / unicode): The word. + orth (int / str): The word. RETURNS (bool): Whether the word has a vector. DOCS: https://spacy.io/api/vocab#has_vector @@ -448,7 +448,7 @@ cdef class Vocab: def to_disk(self, path, *, exclude=tuple()): """Save the current state to a directory. - path (unicode or Path): A path to a directory, which will be created if + path (str or Path): A path to a directory, which will be created if it doesn't exist. exclude (list): String names of serialization fields to exclude. @@ -469,7 +469,7 @@ cdef class Vocab: """Loads state from a directory. Modifies the object in place and returns it. - path (unicode or Path): A path to a directory. + path (str or Path): A path to a directory. exclude (list): String names of serialization fields to exclude. RETURNS (Vocab): The modified `Vocab` object. 
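
The hunks above replace the Python 2 `unicode` type with `str` throughout the Cython signatures and `isinstance` checks, so `Tokenizer` and `Vocab` keep accepting plain Python 3 strings (and, for `Vocab.__contains__`, utf-8 `bytes` or hash keys) exactly as before. A minimal sketch of that lookup behaviour, assuming a blank English pipeline and illustrative example words that are not taken from the patch itself:

    import spacy

    # Any pipeline will do; a blank English one keeps the sketch self-contained.
    nlp = spacy.blank("en")
    nlp("apple banana")  # tokenizing the text adds lexemes to the vocab

    # str keys, utf-8 bytes keys and 64-bit hash keys all resolve to the same
    # entry, matching the isinstance(key, str) branch shown in Vocab.__contains__.
    assert "apple" in nlp.vocab
    assert "apple".encode("utf8") in nlp.vocab
    assert nlp.vocab.strings["apple"] in nlp.vocab

Because `unicode` and `str` name the same type on Python 3, the retyping is effectively a rename at runtime; it also lets the now-redundant `from __future__ import unicode_literals` imports at the top of these files be dropped, as the diffs show.
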
From cd75f96501bf4687b168bc7fe1b30d9b1223f06f Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Thu, 16 Sep 2021 06:07:21 +0900 Subject: [PATCH 009/133] Remove two attributes marked for removal in 3.1 (#9150) * Remove two attributes marked for removal in 3.1 * Add back unused ints with changed names * Change data_dir to _unused_object This is still kept in the type definition, but I removed it from the serialization code. * Put serialization code back for now Not sure how this interacts with old serialized models yet. --- spacy/tokenizer.pxd | 6 ++++-- spacy/vocab.pxd | 2 +- spacy/vocab.pyi | 2 +- spacy/vocab.pyx | 8 ++++---- 4 files changed, 10 insertions(+), 8 deletions(-) diff --git a/spacy/tokenizer.pxd b/spacy/tokenizer.pxd index 44f6ee522..fa38a1015 100644 --- a/spacy/tokenizer.pxd +++ b/spacy/tokenizer.pxd @@ -23,8 +23,10 @@ cdef class Tokenizer: cdef object _infix_finditer cdef object _rules cdef PhraseMatcher _special_matcher - cdef int _property_init_count # TODO: unused, remove in v3.1 - cdef int _property_init_max # TODO: unused, remove in v3.1 + # TODO next two are unused and should be removed in v4 + # https://github.com/explosion/spaCy/pull/9150 + cdef int _unused_int1 + cdef int _unused_int2 cdef Doc _tokenize_affixes(self, str string, bint with_special_cases) cdef int _apply_special_cases(self, Doc doc) except -1 diff --git a/spacy/vocab.pxd b/spacy/vocab.pxd index 9b556247b..b28ad3e85 100644 --- a/spacy/vocab.pxd +++ b/spacy/vocab.pxd @@ -32,7 +32,7 @@ cdef class Vocab: cdef public object writing_system cdef public object get_noun_chunks cdef readonly int length - cdef public object data_dir + cdef public object _unused_object # TODO remove in v4, see #9150 cdef public object lex_attr_getters cdef public object cfg diff --git a/spacy/vocab.pyi b/spacy/vocab.pyi index 0a8ef6198..7c0d0598e 100644 --- a/spacy/vocab.pyi +++ b/spacy/vocab.pyi @@ -71,7 +71,7 @@ def unpickle_vocab( sstore: StringStore, vectors: Any, morphology: Any, - data_dir: Any, + _unused_object: Any, lex_attr_getters: Any, lookups: Any, get_noun_chunks: Any, diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 552898a98..402528f28 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -552,21 +552,21 @@ def pickle_vocab(vocab): sstore = vocab.strings vectors = vocab.vectors morph = vocab.morphology - data_dir = vocab.data_dir + _unused_object = vocab._unused_object lex_attr_getters = srsly.pickle_dumps(vocab.lex_attr_getters) lookups = vocab.lookups get_noun_chunks = vocab.get_noun_chunks return (unpickle_vocab, - (sstore, vectors, morph, data_dir, lex_attr_getters, lookups, get_noun_chunks)) + (sstore, vectors, morph, _unused_object, lex_attr_getters, lookups, get_noun_chunks)) -def unpickle_vocab(sstore, vectors, morphology, data_dir, +def unpickle_vocab(sstore, vectors, morphology, _unused_object, lex_attr_getters, lookups, get_noun_chunks): cdef Vocab vocab = Vocab() vocab.vectors = vectors vocab.strings = sstore vocab.morphology = morphology - vocab.data_dir = data_dir + vocab._unused_object = _unused_object vocab.lex_attr_getters = srsly.pickle_loads(lex_attr_getters) vocab.lookups = lookups vocab.get_noun_chunks = get_noun_chunks From 2f0bb7792081f9f0ab8caddaddf305244d7775d5 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 22 Sep 2021 09:41:05 +0200 Subject: [PATCH 010/133] Accept Doc input in pipelines (#9069) * Accept Doc input in pipelines Allow `Doc` input to `Language.__call__` and `Language.pipe`, which skips `Language.make_doc` and passes the doc directly to the pipeline. 
* ensure_doc helper function * avoid running multiple processes on GPU * Update spacy/tests/test_language.py Co-authored-by: svlandeg --- spacy/errors.py | 1 + spacy/language.py | 34 +++++++++++++++++++++++----------- spacy/tests/test_language.py | 26 ++++++++++++++++++++++++++ 3 files changed, 50 insertions(+), 11 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index 9264ca6d1..f1c068793 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -521,6 +521,7 @@ class Errors: E202 = ("Unsupported alignment mode '{mode}'. Supported modes: {modes}.") # New errors added in v3.x + E866 = ("Expected a string or 'Doc' as input, but got: {type}.") E867 = ("The 'textcat' component requires at least two labels because it " "uses mutually exclusive classes where exactly one label is True " "for each doc. For binary classification tasks, you can use two " diff --git a/spacy/language.py b/spacy/language.py index a8cad1259..540937e66 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -968,7 +968,7 @@ class Language: def __call__( self, - text: str, + text: Union[str, Doc], *, disable: Iterable[str] = SimpleFrozenList(), component_cfg: Optional[Dict[str, Dict[str, Any]]] = None, @@ -977,7 +977,9 @@ class Language: and can contain arbitrary whitespace. Alignment into the original string is preserved. - text (str): The text to be processed. + text (Union[str, Doc]): If `str`, the text to be processed. If `Doc`, + the doc will be passed directly to the pipeline, skipping + `Language.make_doc`. disable (list): Names of the pipeline components to disable. component_cfg (Dict[str, dict]): An optional dictionary with extra keyword arguments for specific components. @@ -985,7 +987,7 @@ class Language: DOCS: https://spacy.io/api/language#call """ - doc = self.make_doc(text) + doc = self._ensure_doc(text) if component_cfg is None: component_cfg = {} for name, proc in self.pipeline: @@ -1069,6 +1071,14 @@ class Language: ) return self.tokenizer(text) + def _ensure_doc(self, doc_like: Union[str, Doc]) -> Doc: + """Create a Doc if need be, or raise an error if the input is not a Doc or a string.""" + if isinstance(doc_like, Doc): + return doc_like + if isinstance(doc_like, str): + return self.make_doc(doc_like) + raise ValueError(Errors.E866.format(type=type(doc_like))) + def update( self, examples: Iterable[Example], @@ -1437,7 +1447,7 @@ class Language: @overload def pipe( self, - texts: Iterable[Tuple[str, _AnyContext]], + texts: Iterable[Tuple[Union[str, Doc], _AnyContext]], *, as_tuples: bool = ..., batch_size: Optional[int] = ..., @@ -1449,7 +1459,7 @@ class Language: def pipe( # noqa: F811 self, - texts: Iterable[str], + texts: Iterable[Union[str, Doc]], *, as_tuples: bool = False, batch_size: Optional[int] = None, @@ -1459,7 +1469,8 @@ class Language: ) -> Iterator[Doc]: """Process texts as a stream, and yield `Doc` objects in order. - texts (Iterable[str]): A sequence of texts to process. + texts (Iterable[Union[str, Doc]]): A sequence of texts or docs to + process. as_tuples (bool): If set to True, inputs should be a sequence of (text, context) tuples. Output will then be a sequence of (doc, context) tuples. Defaults to False. @@ -1515,7 +1526,7 @@ class Language: docs = self._multiprocessing_pipe(texts, pipes, n_process, batch_size) else: # if n_process == 1, no processes are forked. 
- docs = (self.make_doc(text) for text in texts) + docs = (self._ensure_doc(text) for text in texts) for pipe in pipes: docs = pipe(docs) for doc in docs: @@ -1549,7 +1560,7 @@ class Language: procs = [ mp.Process( target=_apply_pipes, - args=(self.make_doc, pipes, rch, sch, Underscore.get_state()), + args=(self._ensure_doc, pipes, rch, sch, Underscore.get_state()), ) for rch, sch in zip(texts_q, bytedocs_send_ch) ] @@ -2084,7 +2095,7 @@ def _copy_examples(examples: Iterable[Example]) -> List[Example]: def _apply_pipes( - make_doc: Callable[[str], Doc], + ensure_doc: Callable[[Union[str, Doc]], Doc], pipes: Iterable[Callable[[Doc], Doc]], receiver, sender, @@ -2092,7 +2103,8 @@ def _apply_pipes( ) -> None: """Worker for Language.pipe - make_doc (Callable[[str,] Doc]): Function to create Doc from text. + ensure_doc (Callable[[Union[str, Doc]], Doc]): Function to create Doc from text + or raise an error if the input is neither a Doc nor a string. pipes (Iterable[Callable[[Doc], Doc]]): The components to apply. receiver (multiprocessing.Connection): Pipe to receive text. Usually created by `multiprocessing.Pipe()` @@ -2105,7 +2117,7 @@ def _apply_pipes( while True: try: texts = receiver.get() - docs = (make_doc(text) for text in texts) + docs = (ensure_doc(text) for text in texts) for pipe in pipes: docs = pipe(docs) # Connection does not accept unpickable objects, so send list. diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py index c911b8d81..e3c25fece 100644 --- a/spacy/tests/test_language.py +++ b/spacy/tests/test_language.py @@ -528,3 +528,29 @@ def test_language_source_and_vectors(nlp2): assert long_string in nlp2.vocab.strings # vectors should remain unmodified assert nlp.vocab.vectors.to_bytes() == vectors_bytes + + +@pytest.mark.parametrize("n_process", [1, 2]) +def test_pass_doc_to_pipeline(nlp, n_process): + texts = ["cats", "dogs", "guinea pigs"] + docs = [nlp.make_doc(text) for text in texts] + assert not any(len(doc.cats) for doc in docs) + doc = nlp(docs[0]) + assert doc.text == texts[0] + assert len(doc.cats) > 0 + if isinstance(get_current_ops(), NumpyOps) or n_process < 2: + docs = nlp.pipe(docs, n_process=n_process) + assert [doc.text for doc in docs] == texts + assert all(len(doc.cats) for doc in docs) + + +def test_invalid_arg_to_pipeline(nlp): + str_list = ["This is a text.", "This is another."] + with pytest.raises(ValueError): + nlp(str_list) # type: ignore + assert len(list(nlp.pipe(str_list))) == 2 + int_list = [1, 2, 3] + with pytest.raises(ValueError): + list(nlp.pipe(int_list)) # type: ignore + with pytest.raises(ValueError): + nlp(int_list) # type: ignore From 68264b4cee01178860dfd90b1f8034e8d20d5976 Mon Sep 17 00:00:00 2001 From: Rumesh Madhusanka <32504465+rumeshmadhusanka@users.noreply.github.com> Date: Thu, 23 Sep 2021 00:13:42 +0530 Subject: [PATCH 011/133] Updating the stop word list for Sinhala language (#9270) --- spacy/lang/si/stop_words.py | 224 ++++++++++++++++++++++++++++++------ 1 file changed, 186 insertions(+), 38 deletions(-) diff --git a/spacy/lang/si/stop_words.py b/spacy/lang/si/stop_words.py index bde662bf7..7d29bc1b4 100644 --- a/spacy/lang/si/stop_words.py +++ b/spacy/lang/si/stop_words.py @@ -1,47 +1,195 @@ STOP_WORDS = set( """ -අතර -එච්චර -එපමණ -එලෙස -එවිට -ඒ -කට -කදී -කින් -ක් -ට -තුර -ත් -ද -නමුත් -නොහොත් -පමණ -පමණි -ම -මෙච්චර -මෙපමණ -මෙලෙස -මෙවිට -මේ -ය -යි -ලදී +සහ +සමග +සමඟ +අහා +ආහ් +ආ +ඕහෝ +අනේ +අඳෝ +අපොයි +අපෝ +අයියෝ +ආයි +ඌයි +චී +චිහ් +චික් +හෝ‍ +දෝ +දෝහෝ +මෙන් +සේ +වැනි +බඳු +වන් +අයුරු +අයුරින් ලෙස -වගේ 
+වැඩි +ශ්‍රී +හා +ය +නිසා +නිසාවෙන් +බවට +බව +බවෙන් +නම් +වැඩි +සිට +දී +මහා +මහ +පමණ +පමණින් +පමන වන විට -විටෙක -විතර -විය -වුව -වුවත් -වුවද -වූ -සමඟ +විටින් +මේ +මෙලෙස +මෙයින් +ඇති +ලෙස +සිදු +වශයෙන් +යන +සඳහා +මගින් +හෝ‍ +ඉතා +ඒ +එම +ද +අතර +විසින් +සමග +පිළිබඳව +පිළිබඳ +තුළ +බව +වැනි +මහ +මෙම +මෙහි +මේ +වෙත +වෙතින් +වෙතට +වෙනුවෙන් +වෙනුවට +වෙන +ගැන +නෑ +අනුව +නව +පිළිබඳ +විශේෂ +දැනට +එහෙන් +මෙහෙන් +එහේ +මෙහේ +ම +තවත් +තව සහ -හා +දක්වා +ට +ගේ +එ +ක +ක් +බවත් +බවද +මත +ඇතුලු +ඇතුළු +මෙසේ +වඩා +වඩාත්ම +නිති +නිතිත් +නිතොර +නිතර +ඉක්බිති +දැන් +යලි +පුන +ඉතින් +සිට +සිටන් +පටන් +තෙක් +දක්වා +සා +තාක් +තුවක් +පවා +ද +හෝ‍ +වත් +විනා +හැර +මිස +මුත් +කිම +කිම් +ඇයි +මන්ද හෙවත් -හෝ +නොහොත් +පතා +පාසා +ගානෙ +තව +ඉතා +බොහෝ +වහා +සෙද +සැනින් +හනික +එම්බා +එම්බල +බොල +නම් +වනාහි +කලී +ඉඳුරා +අන්න +ඔන්න +මෙන්න +උදෙසා +පිණිස +සඳහා +අරබයා +නිසා +එනිසා +එබැවින් +බැවින් +හෙයින් +සේක් +සේක +ගැන +අනුව +පරිදි +විට +තෙක් +මෙතෙක් +මේතාක් +තුරු +තුරා +තුරාවට +තුලින් +නමුත් +එනමුත් +වස් +මෙන් +ලෙස +පරිදි +එහෙත් """.split() ) From 5eced281d861134c372eda6610e9aba9f0e5d8ca Mon Sep 17 00:00:00 2001 From: Jette16 <33116335+Jette16@users.noreply.github.com> Date: Thu, 23 Sep 2021 14:31:42 +0200 Subject: [PATCH 012/133] Add universe test (#9278) * Added test for universe.json * Added contributor agreement * Ran black on test_universe_json.py --- .github/contributors/Jette16.md | 106 +++++++++++++++++++++ .gitignore | 1 + setup.py | 1 + spacy/tests/universe/test_universe_json.py | 17 ++++ 4 files changed, 125 insertions(+) create mode 100644 .github/contributors/Jette16.md create mode 100644 spacy/tests/universe/test_universe_json.py diff --git a/.github/contributors/Jette16.md b/.github/contributors/Jette16.md new file mode 100644 index 000000000..c064f1d4f --- /dev/null +++ b/.github/contributors/Jette16.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. 
With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Henriette Behr | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 23.09.2021 | +| GitHub username | Jette16 | +| Website (optional) | | diff --git a/.gitignore b/.gitignore index ac72f2bbf..60036a475 100644 --- a/.gitignore +++ b/.gitignore @@ -9,6 +9,7 @@ keys/ spacy/tests/package/setup.cfg spacy/tests/package/pyproject.toml spacy/tests/package/requirements.txt +spacy/tests/universe/universe.json # Website website/.cache/ diff --git a/setup.py b/setup.py index fcc124a43..03a1e01dd 100755 --- a/setup.py +++ b/setup.py @@ -81,6 +81,7 @@ COPY_FILES = { ROOT / "setup.cfg": PACKAGE_ROOT / "tests" / "package", ROOT / "pyproject.toml": PACKAGE_ROOT / "tests" / "package", ROOT / "requirements.txt": PACKAGE_ROOT / "tests" / "package", + ROOT / "website" / "meta" / "universe.json": PACKAGE_ROOT / "tests" / "universe", } diff --git a/spacy/tests/universe/test_universe_json.py b/spacy/tests/universe/test_universe_json.py new file mode 100644 index 000000000..295889186 --- /dev/null +++ b/spacy/tests/universe/test_universe_json.py @@ -0,0 +1,17 @@ +import json +import re +from pathlib import Path + + +def test_universe_json(): + + root_dir = Path(__file__).parent + universe_file = root_dir / "universe.json" + + with universe_file.open() as f: + universe_data = json.load(f) + for entry in universe_data["resources"]: + if "github" in entry: + assert not re.match( + r"^(http:)|^(https:)", entry["github"] + ), "Github field should be user/repo, not a url" From 12ab49342c2be36ade4b55cb851471a7a93a5f6f Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 27 Sep 2021 09:16:31 +0200 Subject: [PATCH 013/133] Sync requirements in setup.cfg --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index ff12d511a..fe484f92e 100644 --- a/setup.cfg +++ b/setup.cfg @@ -44,7 +44,7 @@ install_requires = murmurhash>=0.28.0,<1.1.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 - thinc>=8.0.9,<8.1.0 + thinc>=8.0.10,<8.1.0 blis>=0.4.0,<0.8.0 wasabi>=0.8.1,<1.1.0 srsly>=2.4.1,<3.0.0 From fe5f5d6ac66c4e9dafd7c14209039f02082e1b3c Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 27 Sep 2021 14:42:30 +0200 Subject: [PATCH 014/133] Update Catalan tokenizer (#9297) * Update Makefile For more recent python version * updated for bsc changes New tokenization changes * Update test_text.py * updating tests and requirements * changed failed test in test/lang/ca changed failed test in test/lang/ca * Update .gitignore deleted stashed changes line * back to python 3.6 and remove transformer requirements As per request * Update test_exception.py Change the test * Update test_exception.py Remove test print * Update Makefile For more recent python version * updated for bsc changes New tokenization changes * updating tests and requirements * Update requirements.txt Removed spacy-transfromers from requirements * Update test_exception.py Added final punctuation to ensure consistency * Update Makefile Co-authored-by: Sofie Van Landeghem * Format * Update test to check all tokens Co-authored-by: cayorodriguez Co-authored-by: Sofie Van Landeghem --- spacy/lang/ca/__init__.py | 3 ++- spacy/lang/ca/punctuation.py | 11 ++++++++++ spacy/lang/ca/tokenizer_exceptions.py | 21 +++++++++++++++++++ spacy/tests/lang/ca/test_exception.py | 19 +++++++++++++---- .../tests/lang/ca/test_prefix_suffix_infix.py | 9 +++++++- spacy/tests/lang/ca/test_text.py | 7 +++++-- 6 
files changed, 62 insertions(+), 8 deletions(-) mode change 100644 => 100755 spacy/lang/ca/__init__.py mode change 100644 => 100755 spacy/lang/ca/punctuation.py mode change 100644 => 100755 spacy/lang/ca/tokenizer_exceptions.py diff --git a/spacy/lang/ca/__init__.py b/spacy/lang/ca/__init__.py old mode 100644 new mode 100755 index 15d395c12..802c7e4cc --- a/spacy/lang/ca/__init__.py +++ b/spacy/lang/ca/__init__.py @@ -3,7 +3,7 @@ from typing import Optional, Callable from thinc.api import Model from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS -from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES +from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from .syntax_iterators import SYNTAX_ITERATORS @@ -15,6 +15,7 @@ class CatalanDefaults(Language.Defaults): tokenizer_exceptions = TOKENIZER_EXCEPTIONS infixes = TOKENIZER_INFIXES suffixes = TOKENIZER_SUFFIXES + prefixes = TOKENIZER_PREFIXES stop_words = STOP_WORDS lex_attr_getters = LEX_ATTRS syntax_iterators = SYNTAX_ITERATORS diff --git a/spacy/lang/ca/punctuation.py b/spacy/lang/ca/punctuation.py old mode 100644 new mode 100755 index 39db08f17..8e2f09828 --- a/spacy/lang/ca/punctuation.py +++ b/spacy/lang/ca/punctuation.py @@ -1,4 +1,5 @@ from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS +from ..char_classes import LIST_CURRENCY from ..char_classes import CURRENCY from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT from ..char_classes import merge_chars, _units @@ -6,6 +7,14 @@ from ..char_classes import merge_chars, _units ELISION = " ' ’ ".strip().replace(" ", "").replace("\n", "") +_prefixes = ( + ["§", "%", "=", "—", "–", "-", r"\+(?![0-9])"] + + LIST_PUNCT + + LIST_ELLIPSES + + LIST_QUOTES + + LIST_CURRENCY + + LIST_ICONS +) _infixes = ( LIST_ELLIPSES @@ -18,6 +27,7 @@ _infixes = ( r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA), r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA), r"(?<=[{a}][{el}])(?=[{a}0-9])".format(a=ALPHA, el=ELISION), + r"('ls|'l|'ns|'t|'m|'n|-les|-la|-lo|-li|-los|-me|-nos|-te|-vos|-se|-hi|-ne|-ho)(?![A-Za-z])|(-l'|-m'|-t'|-n')", ] ) @@ -44,3 +54,4 @@ _suffixes = ( TOKENIZER_INFIXES = _infixes TOKENIZER_SUFFIXES = _suffixes +TOKENIZER_PREFIXES = _prefixes diff --git a/spacy/lang/ca/tokenizer_exceptions.py b/spacy/lang/ca/tokenizer_exceptions.py old mode 100644 new mode 100755 index 5f9a50f5e..b261b3498 --- a/spacy/lang/ca/tokenizer_exceptions.py +++ b/spacy/lang/ca/tokenizer_exceptions.py @@ -18,12 +18,21 @@ for exc_data in [ {ORTH: "nov.", NORM: "novembre"}, {ORTH: "dec.", NORM: "desembre"}, {ORTH: "Dr.", NORM: "doctor"}, + {ORTH: "Dra.", NORM: "doctora"}, {ORTH: "Sr.", NORM: "senyor"}, {ORTH: "Sra.", NORM: "senyora"}, {ORTH: "Srta.", NORM: "senyoreta"}, {ORTH: "núm", NORM: "número"}, {ORTH: "St.", NORM: "sant"}, {ORTH: "Sta.", NORM: "santa"}, + {ORTH: "pl.", NORM: "plaça"}, + {ORTH: "à."}, + {ORTH: "è."}, + {ORTH: "é."}, + {ORTH: "í."}, + {ORTH: "ò."}, + {ORTH: "ó."}, + {ORTH: "ú."}, {ORTH: "'l"}, {ORTH: "'ls"}, {ORTH: "'m"}, @@ -34,6 +43,18 @@ for exc_data in [ ]: _exc[exc_data[ORTH]] = [exc_data] +_exc["del"] = [{ORTH: "d", NORM: "de"}, {ORTH: "el"}] +_exc["dels"] = [{ORTH: "d", NORM: "de"}, {ORTH: "els"}] + +_exc["al"] = [{ORTH: "a"}, {ORTH: "l", NORM: "el"}] +_exc["als"] = [{ORTH: "a"}, {ORTH: "ls", NORM: "els"}] + +_exc["pel"] = [{ORTH: "p", NORM: "per"}, {ORTH: "el"}] +_exc["pels"] = [{ORTH: "p", NORM: "per"}, {ORTH: "els"}] + +_exc["holahola"] = 
[{ORTH: "holahola", NORM: "cocacola"}] + + # Times _exc["12m."] = [{ORTH: "12"}, {ORTH: "m.", NORM: "p.m."}] diff --git a/spacy/tests/lang/ca/test_exception.py b/spacy/tests/lang/ca/test_exception.py index cfb574b63..499027ab1 100644 --- a/spacy/tests/lang/ca/test_exception.py +++ b/spacy/tests/lang/ca/test_exception.py @@ -11,7 +11,18 @@ def test_ca_tokenizer_handles_abbr(ca_tokenizer, text, lemma): def test_ca_tokenizer_handles_exc_in_text(ca_tokenizer): - text = "La Núria i el Pere han vingut aprox. a les 7 de la tarda." - tokens = ca_tokenizer(text) - assert len(tokens) == 15 - assert tokens[7].text == "aprox." + text = "La Dra. Puig viu a la pl. dels Til·lers." + doc = ca_tokenizer(text) + assert [t.text for t in doc] == [ + "La", + "Dra.", + "Puig", + "viu", + "a", + "la", + "pl.", + "d", + "els", + "Til·lers", + ".", + ] diff --git a/spacy/tests/lang/ca/test_prefix_suffix_infix.py b/spacy/tests/lang/ca/test_prefix_suffix_infix.py index a3c76ab5b..afbdf3696 100644 --- a/spacy/tests/lang/ca/test_prefix_suffix_infix.py +++ b/spacy/tests/lang/ca/test_prefix_suffix_infix.py @@ -2,7 +2,14 @@ import pytest @pytest.mark.parametrize( - "text,expected_tokens", [("d'un", ["d'", "un"]), ("s'ha", ["s'", "ha"])] + "text,expected_tokens", + [ + ("d'un", ["d'", "un"]), + ("s'ha", ["s'", "ha"]), + ("del", ["d", "el"]), + ("cantar-te", ["cantar", "-te"]), + ("-hola", ["-", "hola"]), + ], ) def test_contractions(ca_tokenizer, text, expected_tokens): """Test that the contractions are split into two tokens""" diff --git a/spacy/tests/lang/ca/test_text.py b/spacy/tests/lang/ca/test_text.py index 55bad0e94..5db7af553 100644 --- a/spacy/tests/lang/ca/test_text.py +++ b/spacy/tests/lang/ca/test_text.py @@ -12,17 +12,20 @@ def test_ca_tokenizer_handles_long_text(ca_tokenizer): una gerra de cervesa. Ens asseiem -fotògraf i periodista- en una terrassa buida.""" tokens = ca_tokenizer(text) - assert len(tokens) == 140 + assert len(tokens) == 146 @pytest.mark.parametrize( "text,length", [ - ("Perquè va anar-hi?", 4), + ("Perquè va anar-hi?", 5), + ("El cotxe dels veins.", 6), ("“Ah no?”", 5), ("""Sí! "Anem", va contestar el Joan Carles""", 11), ("Van córrer aprox. 
10km", 5), ("Llavors perqué...", 3), + ("Vull parlar-te'n demà al matí", 8), + ("Vull explicar-t'ho demà al matí", 8), ], ) def test_ca_tokenizer_handles_cnts(ca_tokenizer, text, length): From 5b0b0ca8094650a93dbf75182ee5983190820215 Mon Sep 17 00:00:00 2001 From: "Elia Robyn Lake (Robyn Speer)" Date: Wed, 29 Sep 2021 05:12:50 -0400 Subject: [PATCH 015/133] Move WandB loggers into spacy-loggers (#9223) * factor out the WandB logger into spacy-loggers Signed-off-by: Elia Robyn Speer * depend on spacy-loggers so they are available Signed-off-by: Elia Robyn Speer * remove docs of spacy.WandbLogger.v2 (moved to spacy-loggers) Signed-off-by: Elia Robyn Speer * Version number suggestions from code review Co-authored-by: Adriane Boyd * update references to WandbLogger Signed-off-by: Elia Robyn Speer * make order of deps more consistent Signed-off-by: Elia Robyn Speer Co-authored-by: Elia Robyn Speer Co-authored-by: Adriane Boyd --- requirements.txt | 1 + setup.cfg | 1 + spacy/training/__init__.py | 2 +- spacy/training/loggers.py | 164 --------------------------------- website/docs/api/top-level.md | 68 ++------------ website/docs/usage/projects.md | 16 ++-- website/docs/usage/training.md | 4 +- 7 files changed, 21 insertions(+), 235 deletions(-) diff --git a/requirements.txt b/requirements.txt index 12fdf650f..6f9addbe9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,6 @@ # Our libraries spacy-legacy>=3.0.8,<3.1.0 +spacy-loggers>=1.0.0,<2.0.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 thinc>=8.0.10,<8.1.0 diff --git a/setup.cfg b/setup.cfg index fe484f92e..45fa48ce5 100644 --- a/setup.cfg +++ b/setup.cfg @@ -41,6 +41,7 @@ setup_requires = install_requires = # Our libraries spacy-legacy>=3.0.8,<3.1.0 + spacy-loggers>=1.0.0,<2.0.0 murmurhash>=0.28.0,<1.1.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 diff --git a/spacy/training/__init__.py b/spacy/training/__init__.py index 055f30f42..22f1e64b1 100644 --- a/spacy/training/__init__.py +++ b/spacy/training/__init__.py @@ -7,5 +7,5 @@ from .iob_utils import offsets_to_biluo_tags, biluo_tags_to_offsets # noqa: F40 from .iob_utils import biluo_tags_to_spans, tags_to_entities # noqa: F401 from .gold_io import docs_to_json, read_json_file # noqa: F401 from .batchers import minibatch_by_padded_size, minibatch_by_words # noqa: F401 -from .loggers import console_logger, wandb_logger # noqa: F401 +from .loggers import console_logger # noqa: F401 from .callbacks import create_copy_from_base_model # noqa: F401 diff --git a/spacy/training/loggers.py b/spacy/training/loggers.py index 137e89e56..d80c77b6a 100644 --- a/spacy/training/loggers.py +++ b/spacy/training/loggers.py @@ -4,7 +4,6 @@ import tqdm import sys from ..util import registry -from .. 
import util from ..errors import Errors if TYPE_CHECKING: @@ -100,166 +99,3 @@ def console_logger(progress_bar: bool = False): return setup_printer - -@registry.loggers("spacy.WandbLogger.v2") -def wandb_logger( - project_name: str, - remove_config_values: List[str] = [], - model_log_interval: Optional[int] = None, - log_dataset_dir: Optional[str] = None, -): - try: - import wandb - - # test that these are available - from wandb import init, log, join # noqa: F401 - except ImportError: - raise ImportError(Errors.E880) - - console = console_logger(progress_bar=False) - - def setup_logger( - nlp: "Language", stdout: IO = sys.stdout, stderr: IO = sys.stderr - ) -> Tuple[Callable[[Dict[str, Any]], None], Callable[[], None]]: - config = nlp.config.interpolate() - config_dot = util.dict_to_dot(config) - for field in remove_config_values: - del config_dot[field] - config = util.dot_to_dict(config_dot) - run = wandb.init(project=project_name, config=config, reinit=True) - console_log_step, console_finalize = console(nlp, stdout, stderr) - - def log_dir_artifact( - path: str, - name: str, - type: str, - metadata: Optional[Dict[str, Any]] = {}, - aliases: Optional[List[str]] = [], - ): - dataset_artifact = wandb.Artifact(name, type=type, metadata=metadata) - dataset_artifact.add_dir(path, name=name) - wandb.log_artifact(dataset_artifact, aliases=aliases) - - if log_dataset_dir: - log_dir_artifact(path=log_dataset_dir, name="dataset", type="dataset") - - def log_step(info: Optional[Dict[str, Any]]): - console_log_step(info) - if info is not None: - score = info["score"] - other_scores = info["other_scores"] - losses = info["losses"] - wandb.log({"score": score}) - if losses: - wandb.log({f"loss_{k}": v for k, v in losses.items()}) - if isinstance(other_scores, dict): - wandb.log(other_scores) - if model_log_interval and info.get("output_path"): - if info["step"] % model_log_interval == 0 and info["step"] != 0: - log_dir_artifact( - path=info["output_path"], - name="pipeline_" + run.id, - type="checkpoint", - metadata=info, - aliases=[ - f"epoch {info['epoch']} step {info['step']}", - "latest", - "best" - if info["score"] == max(info["checkpoints"])[0] - else "", - ], - ) - - def finalize() -> None: - console_finalize() - wandb.join() - - return log_step, finalize - - return setup_logger - - -@registry.loggers("spacy.WandbLogger.v3") -def wandb_logger( - project_name: str, - remove_config_values: List[str] = [], - model_log_interval: Optional[int] = None, - log_dataset_dir: Optional[str] = None, - entity: Optional[str] = None, - run_name: Optional[str] = None, -): - try: - import wandb - - # test that these are available - from wandb import init, log, join # noqa: F401 - except ImportError: - raise ImportError(Errors.E880) - - console = console_logger(progress_bar=False) - - def setup_logger( - nlp: "Language", stdout: IO = sys.stdout, stderr: IO = sys.stderr - ) -> Tuple[Callable[[Dict[str, Any]], None], Callable[[], None]]: - config = nlp.config.interpolate() - config_dot = util.dict_to_dot(config) - for field in remove_config_values: - del config_dot[field] - config = util.dot_to_dict(config_dot) - run = wandb.init( - project=project_name, config=config, entity=entity, reinit=True - ) - - if run_name: - wandb.run.name = run_name - - console_log_step, console_finalize = console(nlp, stdout, stderr) - - def log_dir_artifact( - path: str, - name: str, - type: str, - metadata: Optional[Dict[str, Any]] = {}, - aliases: Optional[List[str]] = [], - ): - dataset_artifact = wandb.Artifact(name, type=type, 
metadata=metadata) - dataset_artifact.add_dir(path, name=name) - wandb.log_artifact(dataset_artifact, aliases=aliases) - - if log_dataset_dir: - log_dir_artifact(path=log_dataset_dir, name="dataset", type="dataset") - - def log_step(info: Optional[Dict[str, Any]]): - console_log_step(info) - if info is not None: - score = info["score"] - other_scores = info["other_scores"] - losses = info["losses"] - wandb.log({"score": score}) - if losses: - wandb.log({f"loss_{k}": v for k, v in losses.items()}) - if isinstance(other_scores, dict): - wandb.log(other_scores) - if model_log_interval and info.get("output_path"): - if info["step"] % model_log_interval == 0 and info["step"] != 0: - log_dir_artifact( - path=info["output_path"], - name="pipeline_" + run.id, - type="checkpoint", - metadata=info, - aliases=[ - f"epoch {info['epoch']} step {info['step']}", - "latest", - "best" - if info["score"] == max(info["checkpoints"])[0] - else "", - ], - ) - - def finalize() -> None: - console_finalize() - wandb.join() - - return log_step, finalize - - return setup_logger diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index f9490803f..48c16e559 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -411,10 +411,13 @@ finished. To log each training step, a [`spacy train`](/api/cli#train), including information such as the training loss and the accuracy scores on the development set. -There are two built-in logging functions: a logger printing results to the -console in tabular format (which is the default), and one that also sends the -results to a [Weights & Biases](https://www.wandb.com/) dashboard. Instead of -using one of the built-in loggers listed here, you can also +The built-in, default logger is the ConsoleLogger, which prints results to the +console in tabular format. The +[spacy-loggers](https://github.com/explosion/spacy-loggers) package, included as +a dependency of spaCy, enables other loggers: currently it provides one that sends +results to a [Weights & Biases](https://www.wandb.com/) dashboard. + +Instead of using one of the built-in loggers, you can [implement your own](/usage/training#custom-logging). #### spacy.ConsoleLogger.v1 {#ConsoleLogger tag="registered function"} @@ -463,63 +466,6 @@ start decreasing across epochs. -#### spacy.WandbLogger.v3 {#WandbLogger tag="registered function"} - -> #### Installation -> -> ```bash -> $ pip install wandb -> $ wandb login -> ``` - -Built-in logger that sends the results of each training step to the dashboard of -the [Weights & Biases](https://www.wandb.com/) tool. To use this logger, Weights -& Biases should be installed, and you should be logged in. The logger will send -the full config file to W&B, as well as various system information such as -memory utilization, network traffic, disk IO, GPU statistics, etc. This will -also include information such as your hostname and operating system, as well as -the location of your Python executable. - - - -Note that by default, the full (interpolated) -[training config](/usage/training#config) is sent over to the W&B dashboard. If -you prefer to **exclude certain information** such as path names, you can list -those fields in "dot notation" in the `remove_config_values` parameter. These -fields will then be removed from the config before uploading, but will otherwise -remain in the config file stored on your local system. 
- - - -> #### Example config -> -> ```ini -> [training.logger] -> @loggers = "spacy.WandbLogger.v3" -> project_name = "monitor_spacy_training" -> remove_config_values = ["paths.train", "paths.dev", "corpora.train.path", "corpora.dev.path"] -> log_dataset_dir = "corpus" -> model_log_interval = 1000 -> ``` - -| Name | Description | -| ---------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `project_name` | The name of the project in the Weights & Biases interface. The project will be created automatically if it doesn't exist yet. ~~str~~ | -| `remove_config_values` | A list of values to include from the config before it is uploaded to W&B (default: empty). ~~List[str]~~ | -| `model_log_interval` | Steps to wait between logging model checkpoints to W&B dasboard (default: None). ~~Optional[int]~~ | -| `log_dataset_dir` | Directory containing dataset to be logged and versioned as W&B artifact (default: None). ~~Optional[str]~~ | -| `run_name` | The name of the run. If you don't specify a run_name, the name will be created by wandb library. (default: None ). ~~Optional[str]~~ | -| `entity` | An entity is a username or team name where you're sending runs. If you don't specify an entity, the run will be sent to your default entity, which is usually your username. (default: None). ~~Optional[str]~~ | - - - -Get started with tracking your spaCy training runs in Weights & Biases using our -project template. It trains on the IMDB Movie Review Dataset and includes a -simple config with the built-in `WandbLogger`, as well as a custom example of -creating variants of the config for a simple hyperparameter grid search and -logging the results. - - ## Readers {#readers} diff --git a/website/docs/usage/projects.md b/website/docs/usage/projects.md index 6f6cef7c8..e0e787a1d 100644 --- a/website/docs/usage/projects.md +++ b/website/docs/usage/projects.md @@ -1016,20 +1016,22 @@ commands: [Weights & Biases](https://www.wandb.com/) is a popular platform for experiment tracking. spaCy integrates with it out-of-the-box via the -[`WandbLogger`](/api/top-level#WandbLogger), which you can add as the -`[training.logger]` block of your training [config](/usage/training#config). The -results of each step are then logged in your project, together with the full -**training config**. This means that _every_ hyperparameter, registered function -name and argument will be tracked and you'll be able to see the impact it has on -your results. +[`WandbLogger`](https://github.com/explosion/spacy-loggers#wandblogger), which +you can add as the `[training.logger]` block of your training +[config](/usage/training#config). The results of each step are then logged in +your project, together with the full **training config**. This means that +_every_ hyperparameter, registered function name and argument will be tracked +and you'll be able to see the impact it has on your results. 
> #### Example config > > ```ini > [training.logger] -> @loggers = "spacy.WandbLogger.v2" +> @loggers = "spacy.WandbLogger.v3" > project_name = "monitor_spacy_training" > remove_config_values = ["paths.train", "paths.dev", "corpora.train.path", "corpora.dev.path"] +> log_dataset_dir = "corpus" +> model_log_interval = 1000 > ``` ![Screenshot: Visualized training results](../images/wandb1.jpg) diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md index 94fdad209..c28b43ea6 100644 --- a/website/docs/usage/training.md +++ b/website/docs/usage/training.md @@ -944,8 +944,8 @@ During training, the results of each step are passed to a logger function. By default, these results are written to the console with the [`ConsoleLogger`](/api/top-level#ConsoleLogger). There is also built-in support for writing the log files to [Weights & Biases](https://www.wandb.com/) with the -[`WandbLogger`](/api/top-level#WandbLogger). On each step, the logger function -receives a **dictionary** with the following keys: +[`WandbLogger`](https://github.com/explosion/spacy-loggers#wandblogger). On each +step, the logger function receives a **dictionary** with the following keys: | Key | Value | | -------------- | ----------------------------------------------------------------------------------------------------- | From 8fe525beb5b9d0948e52de0f0aeb87df01287e65 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jim=20O=E2=80=99Regan?= Date: Thu, 30 Sep 2021 13:18:47 +0100 Subject: [PATCH 016/133] Add an Irish lemmatiser, based on BuNaMo (#9102) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add tréis/théis * remove previous contents, add demutate/unponc * fmt off/on wrapping * type hints * IrishLemmatizer (sic) * Use spacy-lookups-data>=1.0.3 * Minor bug fixes, refactoring for IrishLemmatizer * Fix return type for ADP list lookups * Fix and refactor lookup table lookups for missing/string/list * Remove unused variables * skip lookup of verbal substantives and adjectives; just demutate * Fix morph checks API details * Add types and format * Move helper methods into lemmatizer Co-authored-by: Adriane Boyd --- setup.cfg | 2 +- spacy/lang/ga/__init__.py | 17 +++ spacy/lang/ga/irish_morphology_helpers.py | 35 ----- spacy/lang/ga/lemmatizer.py | 162 ++++++++++++++++++++++ spacy/lang/ga/tokenizer_exceptions.py | 2 + 5 files changed, 182 insertions(+), 36 deletions(-) delete mode 100644 spacy/lang/ga/irish_morphology_helpers.py create mode 100644 spacy/lang/ga/lemmatizer.py diff --git a/setup.cfg b/setup.cfg index 45fa48ce5..da9944a5e 100644 --- a/setup.cfg +++ b/setup.cfg @@ -69,7 +69,7 @@ console_scripts = [options.extras_require] lookups = - spacy_lookups_data>=1.0.2,<1.1.0 + spacy_lookups_data>=1.0.3,<1.1.0 transformers = spacy_transformers>=1.0.1,<1.1.0 ray = diff --git a/spacy/lang/ga/__init__.py b/spacy/lang/ga/__init__.py index 80131368b..167edf939 100644 --- a/spacy/lang/ga/__init__.py +++ b/spacy/lang/ga/__init__.py @@ -1,6 +1,11 @@ +from typing import Optional + +from thinc.api import Model + from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS from ...language import Language +from .lemmatizer import IrishLemmatizer class IrishDefaults(Language.Defaults): @@ -13,4 +18,16 @@ class Irish(Language): Defaults = IrishDefaults +@Irish.factory( + "lemmatizer", + assigns=["token.lemma"], + default_config={"model": None, "mode": "pos_lookup", "overwrite": False}, + default_score_weights={"lemma_acc": 1.0}, +) +def make_lemmatizer( + 
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool +): + return IrishLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) + + __all__ = ["Irish"] diff --git a/spacy/lang/ga/irish_morphology_helpers.py b/spacy/lang/ga/irish_morphology_helpers.py deleted file mode 100644 index d606da975..000000000 --- a/spacy/lang/ga/irish_morphology_helpers.py +++ /dev/null @@ -1,35 +0,0 @@ -# fmt: off -consonants = ["b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n", "p", "q", "r", "s", "t", "v", "w", "x", "z"] -broad_vowels = ["a", "á", "o", "ó", "u", "ú"] -slender_vowels = ["e", "é", "i", "í"] -vowels = broad_vowels + slender_vowels -# fmt: on - - -def ends_dentals(word): - if word != "" and word[-1] in ["d", "n", "t", "s"]: - return True - else: - return False - - -def devoice(word): - if len(word) > 2 and word[-2] == "s" and word[-1] == "d": - return word[:-1] + "t" - else: - return word - - -def ends_with_vowel(word): - return word != "" and word[-1] in vowels - - -def starts_with_vowel(word): - return word != "" and word[0] in vowels - - -def deduplicate(word): - if len(word) > 2 and word[-2] == word[-1] and word[-1] in consonants: - return word[:-1] - else: - return word diff --git a/spacy/lang/ga/lemmatizer.py b/spacy/lang/ga/lemmatizer.py new file mode 100644 index 000000000..47aec8fd4 --- /dev/null +++ b/spacy/lang/ga/lemmatizer.py @@ -0,0 +1,162 @@ +from typing import List, Dict, Tuple + +from ...pipeline import Lemmatizer +from ...tokens import Token + + +class IrishLemmatizer(Lemmatizer): + # This is a lookup-based lemmatiser using data extracted from + # BuNaMo (https://github.com/michmech/BuNaMo) + + @classmethod + def get_lookups_config(cls, mode: str) -> Tuple[List[str], List[str]]: + if mode == "pos_lookup": + # fmt: off + required = [ + "lemma_lookup_adj", "lemma_lookup_adp", + "lemma_lookup_noun", "lemma_lookup_verb" + ] + # fmt: on + return (required, []) + else: + return super().get_lookups_config(mode) + + def pos_lookup_lemmatize(self, token: Token) -> List[str]: + univ_pos = token.pos_ + string = unponc(token.text) + if univ_pos not in ["PROPN", "ADP", "ADJ", "NOUN", "VERB"]: + return [string.lower()] + demutated = demutate(string) + secondary = "" + if string[0:1].lower() == "h" and string[1:2].lower() in "aáeéiíoóuú": + secondary = string[1:] + lookup_pos = univ_pos.lower() + if univ_pos == "PROPN": + lookup_pos = "noun" + if token.has_morph(): + # TODO: lookup is actually required for the genitive forms, but + # this is not in BuNaMo, and would not be of use with IDT. 
+ if univ_pos == "NOUN" and ( + "VerbForm=Vnoun" in token.morph or "VerbForm=Inf" in token.morph + ): + hpref = "Form=HPref" in token.morph + return [demutate(string, hpref).lower()] + elif univ_pos == "ADJ" and "VerbForm=Part" in token.morph: + return [demutate(string).lower()] + lookup_table = self.lookups.get_table("lemma_lookup_" + lookup_pos, {}) + + def to_list(value): + if value is None: + value = [] + elif not isinstance(value, list): + value = [value] + return value + + if univ_pos == "ADP": + return to_list(lookup_table.get(string, string.lower())) + ret = [] + if univ_pos == "PROPN": + ret.extend(to_list(lookup_table.get(demutated))) + ret.extend(to_list(lookup_table.get(secondary))) + else: + ret.extend(to_list(lookup_table.get(demutated.lower()))) + ret.extend(to_list(lookup_table.get(secondary.lower()))) + if len(ret) == 0: + ret = [string.lower()] + return ret + + +def demutate(word: str, is_hpref: bool = False) -> str: + UVOWELS = "AÁEÉIÍOÓUÚ" + LVOWELS = "aáeéiíoóuú" + lc = word.lower() + # remove eclipsis + if lc.startswith("bhf"): + word = word[2:] + elif lc.startswith("mb"): + word = word[1:] + elif lc.startswith("gc"): + word = word[1:] + elif lc.startswith("nd"): + word = word[1:] + elif lc.startswith("ng"): + word = word[1:] + elif lc.startswith("bp"): + word = word[1:] + elif lc.startswith("dt"): + word = word[1:] + elif word[0:1] == "n" and word[1:2] in UVOWELS: + word = word[1:] + elif lc.startswith("n-") and word[2:3] in LVOWELS: + word = word[2:] + # non-standard eclipsis + elif lc.startswith("bh-f"): + word = word[3:] + elif lc.startswith("m-b"): + word = word[2:] + elif lc.startswith("g-c"): + word = word[2:] + elif lc.startswith("n-d"): + word = word[2:] + elif lc.startswith("n-g"): + word = word[2:] + elif lc.startswith("b-p"): + word = word[2:] + elif lc.startswith("d-t"): + word = word[2:] + + # t-prothesis + elif lc.startswith("ts"): + word = word[1:] + elif lc.startswith("t-s"): + word = word[2:] + + # h-prothesis, if known to be present + elif is_hpref and word[0:1] == "h": + word = word[1:] + # h-prothesis, simple case + # words can also begin with 'h', but unlike eclipsis, + # a hyphen is not used, so that needs to be handled + # elsewhere + elif word[0:1] == "h" and word[1:2] in UVOWELS: + word = word[1:] + + # lenition + # this breaks the previous if, to handle super-non-standard + # text where both eclipsis and lenition were used. 
+ if lc[0:1] in "bcdfgmpst" and lc[1:2] == "h": + word = word[0:1] + word[2:] + + return word + + +def unponc(word: str) -> str: + # fmt: off + PONC = { + "ḃ": "bh", + "ċ": "ch", + "ḋ": "dh", + "ḟ": "fh", + "ġ": "gh", + "ṁ": "mh", + "ṗ": "ph", + "ṡ": "sh", + "ṫ": "th", + "Ḃ": "BH", + "Ċ": "CH", + "Ḋ": "DH", + "Ḟ": "FH", + "Ġ": "GH", + "Ṁ": "MH", + "Ṗ": "PH", + "Ṡ": "SH", + "Ṫ": "TH" + } + # fmt: on + buf = [] + for ch in word: + if ch in PONC: + buf.append(PONC[ch]) + else: + buf.append(ch) + return "".join(buf) diff --git a/spacy/lang/ga/tokenizer_exceptions.py b/spacy/lang/ga/tokenizer_exceptions.py index abf49c511..63af65fe9 100644 --- a/spacy/lang/ga/tokenizer_exceptions.py +++ b/spacy/lang/ga/tokenizer_exceptions.py @@ -9,6 +9,8 @@ _exc = { "ded'": [{ORTH: "de", NORM: "de"}, {ORTH: "d'", NORM: "do"}], "lem'": [{ORTH: "le", NORM: "le"}, {ORTH: "m'", NORM: "mo"}], "led'": [{ORTH: "le", NORM: "le"}, {ORTH: "d'", NORM: "do"}], + "théis": [{ORTH: "th", NORM: "tar"}, {ORTH: "éis", NORM: "éis"}], + "tréis": [{ORTH: "tr", NORM: "tar"}, {ORTH: "éis", NORM: "éis"}], } for exc_data in [ From 03fefa37e2170a0b6b52c1bf1c5350c1e1651c18 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 30 Sep 2021 15:35:55 +0200 Subject: [PATCH 017/133] Add overwrite settings for more components (#9050) * Add overwrite settings for more components For pipeline components where it's relevant and not already implemented, add an explicit `overwrite` setting that controls whether `set_annotations` overwrites existing annotation. For the `morphologizer`, add an additional setting `extend`, which controls whether the existing features are preserved. * +overwrite, +extend: overwrite values of existing features, add any new features * +overwrite, -extend: overwrite completely, removing any existing features * -overwrite, +extend: keep values of existing features, add any new features * -overwrite, -extend: do not modify the existing value if set In all cases an unset value will be set by `set_annotations`. Preserve current overwrite defaults: * True: morphologizer, entity linker * False: tagger, sentencizer, senter * Add backwards compat overwrite settings * Put empty line back Removed by accident in last commit * Set backwards-compatible defaults in __init__ Because the `TrainablePipe` serialization methods update `cfg`, there's no straightforward way to detect whether models serialized with a previous version are missing the overwrite settings. It would be possible in the sentencizer due to its separate serialization methods, however to keep the changes parallel, this also sets the default in `__init__`. * Remove traces Co-authored-by: Paul O'Leary McCann --- spacy/pipeline/entity_linker.py | 12 +++++-- spacy/pipeline/morphologizer.pyx | 42 +++++++++++++++++++--- spacy/pipeline/sentencizer.pyx | 18 ++++++---- spacy/pipeline/senter.pyx | 24 +++++++++---- spacy/pipeline/tagger.pyx | 30 ++++++++++++---- spacy/tests/pipeline/test_morphologizer.py | 36 +++++++++++++++++++ 6 files changed, 135 insertions(+), 27 deletions(-) diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index a17eed13c..80e135a30 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -20,6 +20,8 @@ from ..util import SimpleFrozenList, registry from .. 
import util from ..scorer import Scorer +# See #9050 +BACKWARD_OVERWRITE = True default_model_config = """ [model] @@ -50,6 +52,7 @@ DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"] "incl_context": True, "entity_vector_length": 64, "get_candidates": {"@misc": "spacy.CandidateGenerator.v1"}, + "overwrite": True, "scorer": {"@scorers": "spacy.entity_linker_scorer.v1"}, }, default_score_weights={ @@ -69,6 +72,7 @@ def make_entity_linker( incl_context: bool, entity_vector_length: int, get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]], + overwrite: bool, scorer: Optional[Callable], ): """Construct an EntityLinker component. @@ -95,6 +99,7 @@ def make_entity_linker( incl_context=incl_context, entity_vector_length=entity_vector_length, get_candidates=get_candidates, + overwrite=overwrite, scorer=scorer, ) @@ -128,6 +133,7 @@ class EntityLinker(TrainablePipe): incl_context: bool, entity_vector_length: int, get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]], + overwrite: bool = BACKWARD_OVERWRITE, scorer: Optional[Callable] = entity_linker_score, ) -> None: """Initialize an entity linker. @@ -156,7 +162,7 @@ class EntityLinker(TrainablePipe): self.incl_prior = incl_prior self.incl_context = incl_context self.get_candidates = get_candidates - self.cfg = {} + self.cfg = {"overwrite": overwrite} self.distance = CosineDistance(normalize=False) # how many neighbour sentences to take into account # create an empty KB by default. If you want to load a predefined one, specify it in 'initialize'. @@ -399,12 +405,14 @@ class EntityLinker(TrainablePipe): if count_ents != len(kb_ids): raise ValueError(Errors.E148.format(ents=count_ents, ids=len(kb_ids))) i = 0 + overwrite = self.cfg["overwrite"] for doc in docs: for ent in doc.ents: kb_id = kb_ids[i] i += 1 for token in ent: - token.ent_kb_id_ = kb_id + if token.ent_kb_id == 0 or overwrite: + token.ent_kb_id_ = kb_id def to_bytes(self, *, exclude=tuple()): """Serialize the pipe to a bytestring. diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index f476f02c4..db425b69a 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -19,6 +19,9 @@ from ..scorer import Scorer from ..training import validate_examples, validate_get_examples from ..util import registry +# See #9050 +BACKWARD_OVERWRITE = True +BACKWARD_EXTEND = False default_model_config = """ [model] @@ -49,16 +52,18 @@ DEFAULT_MORPH_MODEL = Config().from_str(default_model_config)["model"] @Language.factory( "morphologizer", assigns=["token.morph", "token.pos"], - default_config={"model": DEFAULT_MORPH_MODEL, "scorer": {"@scorers": "spacy.morphologizer_scorer.v1"}}, + default_config={"model": DEFAULT_MORPH_MODEL, "overwrite": True, "extend": False, "scorer": {"@scorers": "spacy.morphologizer_scorer.v1"}}, default_score_weights={"pos_acc": 0.5, "morph_acc": 0.5, "morph_per_feat": None}, ) def make_morphologizer( nlp: Language, model: Model, name: str, + overwrite: bool, + extend: bool, scorer: Optional[Callable], ): - return Morphologizer(nlp.vocab, model, name, scorer=scorer) + return Morphologizer(nlp.vocab, model, name, overwrite=overwrite, extend=extend, scorer=scorer) def morphologizer_score(examples, **kwargs): @@ -87,6 +92,8 @@ class Morphologizer(Tagger): model: Model, name: str = "morphologizer", *, + overwrite: bool = BACKWARD_OVERWRITE, + extend: bool = BACKWARD_EXTEND, scorer: Optional[Callable] = morphologizer_score, ): """Initialize a morphologizer. 
@@ -109,7 +116,12 @@ class Morphologizer(Tagger): # store mappings from morph+POS labels to token-level annotations: # 1) labels_morph stores a mapping from morph+POS->morph # 2) labels_pos stores a mapping from morph+POS->POS - cfg = {"labels_morph": {}, "labels_pos": {}} + cfg = { + "labels_morph": {}, + "labels_pos": {}, + "overwrite": overwrite, + "extend": extend, + } self.cfg = dict(sorted(cfg.items())) self.scorer = scorer @@ -217,14 +229,34 @@ class Morphologizer(Tagger): docs = [docs] cdef Doc doc cdef Vocab vocab = self.vocab + cdef bint overwrite = self.cfg["overwrite"] + cdef bint extend = self.cfg["extend"] for i, doc in enumerate(docs): doc_tag_ids = batch_tag_ids[i] if hasattr(doc_tag_ids, "get"): doc_tag_ids = doc_tag_ids.get() for j, tag_id in enumerate(doc_tag_ids): morph = self.labels[tag_id] - doc.c[j].morph = self.vocab.morphology.add(self.cfg["labels_morph"].get(morph, 0)) - doc.c[j].pos = self.cfg["labels_pos"].get(morph, 0) + # set morph + if doc.c[j].morph == 0 or overwrite or extend: + if overwrite and extend: + # morphologizer morph overwrites any existing features + # while extending + extended_morph = Morphology.feats_to_dict(self.vocab.strings[doc.c[j].morph]) + extended_morph.update(Morphology.feats_to_dict(self.cfg["labels_morph"].get(morph, 0))) + doc.c[j].morph = self.vocab.morphology.add(extended_morph) + elif extend: + # existing features are preserved and any new features + # are added + extended_morph = Morphology.feats_to_dict(self.cfg["labels_morph"].get(morph, 0)) + extended_morph.update(Morphology.feats_to_dict(self.vocab.strings[doc.c[j].morph])) + doc.c[j].morph = self.vocab.morphology.add(extended_morph) + else: + # clobber + doc.c[j].morph = self.vocab.morphology.add(self.cfg["labels_morph"].get(morph, 0)) + # set POS + if doc.c[j].pos == 0 or overwrite: + doc.c[j].pos = self.cfg["labels_pos"].get(morph, 0) def get_loss(self, examples, scores): """Find the loss and gradient of loss for the batch of documents and diff --git a/spacy/pipeline/sentencizer.pyx b/spacy/pipeline/sentencizer.pyx index 5e64246ad..77f4e8adb 100644 --- a/spacy/pipeline/sentencizer.pyx +++ b/spacy/pipeline/sentencizer.pyx @@ -10,20 +10,23 @@ from ..language import Language from ..scorer import Scorer from .. import util +# see #9050 +BACKWARD_OVERWRITE = False @Language.factory( "sentencizer", assigns=["token.is_sent_start", "doc.sents"], - default_config={"punct_chars": None, "scorer": {"@scorers": "spacy.senter_scorer.v1"}}, + default_config={"punct_chars": None, "overwrite": False, "scorer": {"@scorers": "spacy.senter_scorer.v1"}}, default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0}, ) def make_sentencizer( nlp: Language, name: str, punct_chars: Optional[List[str]], + overwrite: bool, scorer: Optional[Callable], ): - return Sentencizer(name, punct_chars=punct_chars, scorer=scorer) + return Sentencizer(name, punct_chars=punct_chars, overwrite=overwrite, scorer=scorer) class Sentencizer(Pipe): @@ -49,6 +52,7 @@ class Sentencizer(Pipe): name="sentencizer", *, punct_chars=None, + overwrite=BACKWARD_OVERWRITE, scorer=senter_score, ): """Initialize the sentencizer. 
@@ -65,6 +69,7 @@ class Sentencizer(Pipe): self.punct_chars = set(punct_chars) else: self.punct_chars = set(self.default_punct_chars) + self.overwrite = overwrite self.scorer = scorer def __call__(self, doc): @@ -126,8 +131,7 @@ class Sentencizer(Pipe): for i, doc in enumerate(docs): doc_tag_ids = batch_tag_ids[i] for j, tag_id in enumerate(doc_tag_ids): - # Don't clobber existing sentence boundaries - if doc.c[j].sent_start == 0: + if doc.c[j].sent_start == 0 or self.overwrite: if tag_id: doc.c[j].sent_start = 1 else: @@ -140,7 +144,7 @@ class Sentencizer(Pipe): DOCS: https://spacy.io/api/sentencizer#to_bytes """ - return srsly.msgpack_dumps({"punct_chars": list(self.punct_chars)}) + return srsly.msgpack_dumps({"punct_chars": list(self.punct_chars), "overwrite": self.overwrite}) def from_bytes(self, bytes_data, *, exclude=tuple()): """Load the sentencizer from a bytestring. @@ -152,6 +156,7 @@ class Sentencizer(Pipe): """ cfg = srsly.msgpack_loads(bytes_data) self.punct_chars = set(cfg.get("punct_chars", self.default_punct_chars)) + self.overwrite = cfg.get("overwrite", self.overwrite) return self def to_disk(self, path, *, exclude=tuple()): @@ -161,7 +166,7 @@ class Sentencizer(Pipe): """ path = util.ensure_path(path) path = path.with_suffix(".json") - srsly.write_json(path, {"punct_chars": list(self.punct_chars)}) + srsly.write_json(path, {"punct_chars": list(self.punct_chars), "overwrite": self.overwrite}) def from_disk(self, path, *, exclude=tuple()): @@ -173,4 +178,5 @@ class Sentencizer(Pipe): path = path.with_suffix(".json") cfg = srsly.read_json(path) self.punct_chars = set(cfg.get("punct_chars", self.default_punct_chars)) + self.overwrite = cfg.get("overwrite", self.overwrite) return self diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx index b1fb2ec37..54ce021af 100644 --- a/spacy/pipeline/senter.pyx +++ b/spacy/pipeline/senter.pyx @@ -15,6 +15,8 @@ from ..training import validate_examples, validate_get_examples from ..util import registry from .. import util +# See #9050 +BACKWARD_OVERWRITE = False default_model_config = """ [model] @@ -36,11 +38,11 @@ DEFAULT_SENTER_MODEL = Config().from_str(default_model_config)["model"] @Language.factory( "senter", assigns=["token.is_sent_start"], - default_config={"model": DEFAULT_SENTER_MODEL, "scorer": {"@scorers": "spacy.senter_scorer.v1"}}, + default_config={"model": DEFAULT_SENTER_MODEL, "overwrite": False, "scorer": {"@scorers": "spacy.senter_scorer.v1"}}, default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0}, ) -def make_senter(nlp: Language, name: str, model: Model, scorer: Optional[Callable]): - return SentenceRecognizer(nlp.vocab, model, name, scorer=scorer) +def make_senter(nlp: Language, name: str, model: Model, overwrite: bool, scorer: Optional[Callable]): + return SentenceRecognizer(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer) def senter_score(examples, **kwargs): @@ -62,7 +64,15 @@ class SentenceRecognizer(Tagger): DOCS: https://spacy.io/api/sentencerecognizer """ - def __init__(self, vocab, model, name="senter", *, scorer=senter_score): + def __init__( + self, + vocab, + model, + name="senter", + *, + overwrite=BACKWARD_OVERWRITE, + scorer=senter_score, + ): """Initialize a sentence recognizer. vocab (Vocab): The shared vocabulary. 
@@ -78,7 +88,7 @@ class SentenceRecognizer(Tagger): self.model = model self.name = name self._rehearsal_model = None - self.cfg = {} + self.cfg = {"overwrite": overwrite} self.scorer = scorer @property @@ -104,13 +114,13 @@ class SentenceRecognizer(Tagger): if isinstance(docs, Doc): docs = [docs] cdef Doc doc + cdef bint overwrite = self.cfg["overwrite"] for i, doc in enumerate(docs): doc_tag_ids = batch_tag_ids[i] if hasattr(doc_tag_ids, "get"): doc_tag_ids = doc_tag_ids.get() for j, tag_id in enumerate(doc_tag_ids): - # Don't clobber existing sentence boundaries - if doc.c[j].sent_start == 0: + if doc.c[j].sent_start == 0 or overwrite: if tag_id == 1: doc.c[j].sent_start = 1 else: diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx index 16d16b497..a9cbac37a 100644 --- a/spacy/pipeline/tagger.pyx +++ b/spacy/pipeline/tagger.pyx @@ -22,6 +22,8 @@ from ..training import validate_examples, validate_get_examples from ..util import registry from .. import util +# See #9050 +BACKWARD_OVERWRITE = False default_model_config = """ [model] @@ -43,10 +45,16 @@ DEFAULT_TAGGER_MODEL = Config().from_str(default_model_config)["model"] @Language.factory( "tagger", assigns=["token.tag"], - default_config={"model": DEFAULT_TAGGER_MODEL, "scorer": {"@scorers": "spacy.tagger_scorer.v1"}}, + default_config={"model": DEFAULT_TAGGER_MODEL, "overwrite": False, "scorer": {"@scorers": "spacy.tagger_scorer.v1"}}, default_score_weights={"tag_acc": 1.0}, ) -def make_tagger(nlp: Language, name: str, model: Model, scorer: Optional[Callable]): +def make_tagger( + nlp: Language, + name: str, + model: Model, + overwrite: bool, + scorer: Optional[Callable], +): """Construct a part-of-speech tagger component. model (Model[List[Doc], List[Floats2d]]): A model instance that predicts @@ -54,7 +62,7 @@ def make_tagger(nlp: Language, name: str, model: Model, scorer: Optional[Callabl in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to 1). """ - return Tagger(nlp.vocab, model, name, scorer=scorer) + return Tagger(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer) def tagger_score(examples, **kwargs): @@ -71,7 +79,15 @@ class Tagger(TrainablePipe): DOCS: https://spacy.io/api/tagger """ - def __init__(self, vocab, model, name="tagger", *, scorer=tagger_score): + def __init__( + self, + vocab, + model, + name="tagger", + *, + overwrite=BACKWARD_OVERWRITE, + scorer=tagger_score, + ): """Initialize a part-of-speech tagger. vocab (Vocab): The shared vocabulary. 
@@ -87,7 +103,7 @@ class Tagger(TrainablePipe): self.model = model self.name = name self._rehearsal_model = None - cfg = {"labels": []} + cfg = {"labels": [], "overwrite": overwrite} self.cfg = dict(sorted(cfg.items())) self.scorer = scorer @@ -149,13 +165,13 @@ class Tagger(TrainablePipe): docs = [docs] cdef Doc doc cdef Vocab vocab = self.vocab + cdef bint overwrite = self.cfg["overwrite"] for i, doc in enumerate(docs): doc_tag_ids = batch_tag_ids[i] if hasattr(doc_tag_ids, "get"): doc_tag_ids = doc_tag_ids.get() for j, tag_id in enumerate(doc_tag_ids): - # Don't clobber preset POS tags - if doc.c[j].tag == 0: + if doc.c[j].tag == 0 or overwrite: doc.c[j].tag = self.vocab.strings[self.labels[tag_id]] def update(self, examples, *, drop=0., sgd=None, losses=None): diff --git a/spacy/tests/pipeline/test_morphologizer.py b/spacy/tests/pipeline/test_morphologizer.py index 9680d70d2..11d6f0477 100644 --- a/spacy/tests/pipeline/test_morphologizer.py +++ b/spacy/tests/pipeline/test_morphologizer.py @@ -8,6 +8,7 @@ from spacy.language import Language from spacy.tests.util import make_tempdir from spacy.morphology import Morphology from spacy.attrs import MORPH +from spacy.tokens import Doc def test_label_types(): @@ -137,6 +138,41 @@ def test_overfitting_IO(): assert [str(t.morph) for t in doc] == gold_morphs assert [t.pos_ for t in doc] == gold_pos_tags + # Test overwrite+extend settings + # (note that "" is unset, "_" is set and empty) + morphs = ["Feat=V", "Feat=N", "_"] + doc = Doc(nlp.vocab, words=["blue", "ham", "like"], morphs=morphs) + orig_morphs = [str(t.morph) for t in doc] + orig_pos_tags = [t.pos_ for t in doc] + morphologizer = nlp.get_pipe("morphologizer") + + # don't overwrite or extend + morphologizer.cfg["overwrite"] = False + doc = morphologizer(doc) + assert [str(t.morph) for t in doc] == orig_morphs + assert [t.pos_ for t in doc] == orig_pos_tags + + # overwrite and extend + morphologizer.cfg["overwrite"] = True + morphologizer.cfg["extend"] = True + doc = Doc(nlp.vocab, words=["I", "like"], morphs=["Feat=A|That=A|This=A", ""]) + doc = morphologizer(doc) + assert [str(t.morph) for t in doc] == ["Feat=N|That=A|This=A", "Feat=V"] + + # extend without overwriting + morphologizer.cfg["overwrite"] = False + morphologizer.cfg["extend"] = True + doc = Doc(nlp.vocab, words=["I", "like"], morphs=["Feat=A|That=A|This=A", "That=B"]) + doc = morphologizer(doc) + assert [str(t.morph) for t in doc] == ["Feat=A|That=A|This=A", "Feat=V|That=B"] + + # overwrite without extending + morphologizer.cfg["overwrite"] = True + morphologizer.cfg["extend"] = False + doc = Doc(nlp.vocab, words=["I", "like"], morphs=["Feat=A|That=A|This=A", ""]) + doc = morphologizer(doc) + assert [str(t.morph) for t in doc] == ["Feat=N", "Feat=V"] + # Test with unset morph and partial POS nlp.remove_pipe("morphologizer") nlp.add_pipe("morphologizer") From 8f2409e5146740bccb94ec641b036cbf48489a28 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Fri, 1 Oct 2021 10:37:39 +0000 Subject: [PATCH 018/133] Don't serialize user data in DocBin if not saving it (fix #9190) (#9226) * Don't store user data if told not to (fix #9190) * Add unit tests for the store_user_data setting --- spacy/tests/training/test_training.py | 24 ++++++++++++++++++++++++ spacy/tokens/_serialize.py | 3 ++- 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/spacy/tests/training/test_training.py b/spacy/tests/training/test_training.py index cd428be15..48636a4eb 100644 --- a/spacy/tests/training/test_training.py +++ 
b/spacy/tests/training/test_training.py @@ -524,6 +524,30 @@ def test_roundtrip_docs_to_docbin(doc): assert cats["TRAVEL"] == reloaded_example.reference.cats["TRAVEL"] assert cats["BAKING"] == reloaded_example.reference.cats["BAKING"] +def test_docbin_user_data_serialized(doc): + doc.user_data["check"] = True + nlp = English() + + with make_tempdir() as tmpdir: + output_file = tmpdir / "userdata.spacy" + DocBin(docs=[doc], store_user_data=True).to_disk(output_file) + reloaded_docs = DocBin().from_disk(output_file).get_docs(nlp.vocab) + reloaded_doc = list(reloaded_docs)[0] + + assert reloaded_doc.user_data["check"] == True + +def test_docbin_user_data_not_serialized(doc): + # this isn't serializable, but that shouldn't cause an error + doc.user_data["check"] = set() + nlp = English() + + with make_tempdir() as tmpdir: + output_file = tmpdir / "userdata.spacy" + DocBin(docs=[doc], store_user_data=False).to_disk(output_file) + reloaded_docs = DocBin().from_disk(output_file).get_docs(nlp.vocab) + reloaded_doc = list(reloaded_docs)[0] + + assert "check" not in reloaded_doc.user_data @pytest.mark.parametrize( "tokens_a,tokens_b,expected", diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py index 2ce329375..5e7d170f8 100644 --- a/spacy/tokens/_serialize.py +++ b/spacy/tokens/_serialize.py @@ -110,7 +110,8 @@ class DocBin: self.strings.add(token.ent_kb_id_) self.strings.add(token.ent_id_) self.cats.append(doc.cats) - self.user_data.append(srsly.msgpack_dumps(doc.user_data)) + if self.store_user_data: + self.user_data.append(srsly.msgpack_dumps(doc.user_data)) self.span_groups.append(doc.spans.to_bytes()) for key, group in doc.spans.items(): for span in group: From 1ee6541ab03ec9ccdeba1befddebbe024af4bd2b Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Fri, 1 Oct 2021 17:19:26 +0000 Subject: [PATCH 019/133] Moving Japanese tokenizer extra info to Token.morph (#8977) * Use morph for extra Japanese tokenizer info Previously Japanese tokenizer info that didn't correspond to Token fields was put in user data. Since spaCy core should avoid touching user data, this moves most information to the Token.morph attribute. It also adds the normalized form, which wasn't exposed before. The subtokens, which are a list of full tokens, are still added to user data, except with the default tokenizer granualarity. With the default tokenizer settings the subtokens are all None, so in this case the user data is simply not set. * Update tests Also adds a new test for norm data. * Update docs * Add Japanese morphologizer factory Set the default to `extend=True` so that the morphologizer does not clobber the values set by the tokenizer. * Use the norm_ field for normalized forms Before this commit, normalized forms were put in the "norm" field in the morph attributes. I am not sure why I did that instead of using the token morph, I think I just forgot about it. 
* Skip test if sudachipy is not installed * Fix import Co-authored-by: Adriane Boyd --- spacy/lang/ja/__init__.py | 64 ++++++++++++++----- spacy/tests/lang/ja/test_lemmatization.py | 14 ++++ .../lang/ja/test_morphologizer_factory.py | 9 +++ spacy/tests/lang/ja/test_tokenizer.py | 37 +++++------ website/docs/usage/models.md | 4 ++ 5 files changed, 95 insertions(+), 33 deletions(-) create mode 100644 spacy/tests/lang/ja/test_morphologizer_factory.py diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py index 12e65413a..04ff152cf 100644 --- a/spacy/lang/ja/__init__.py +++ b/spacy/lang/ja/__init__.py @@ -1,7 +1,8 @@ -from typing import Optional, Union, Dict, Any +from typing import Optional, Union, Dict, Any, Callable from pathlib import Path import srsly from collections import namedtuple +from thinc.api import Model from .stop_words import STOP_WORDS from .syntax_iterators import SYNTAX_ITERATORS @@ -10,9 +11,11 @@ from .tag_orth_map import TAG_ORTH_MAP from .tag_bigram_map import TAG_BIGRAM_MAP from ...errors import Errors from ...language import Language +from ...pipeline import Morphologizer +from ...pipeline.morphologizer import DEFAULT_MORPH_MODEL from ...scorer import Scorer from ...symbols import POS -from ...tokens import Doc +from ...tokens import Doc, MorphAnalysis from ...training import validate_examples from ...util import DummyTokenizer, registry, load_config_from_str from ...vocab import Vocab @@ -41,6 +44,8 @@ class JapaneseTokenizer(DummyTokenizer): self.vocab = vocab self.split_mode = split_mode self.tokenizer = try_sudachi_import(self.split_mode) + # if we're using split mode A we don't need subtokens + self.need_subtokens = not (split_mode is None or split_mode == "A") def __reduce__(self): return JapaneseTokenizer, (self.vocab, self.split_mode) @@ -52,8 +57,8 @@ class JapaneseTokenizer(DummyTokenizer): dtokens, spaces = get_dtokens_and_spaces(dtokens, text) # create Doc with tag bi-gram based part-of-speech identification rules - words, tags, inflections, lemmas, readings, sub_tokens_list = ( - zip(*dtokens) if dtokens else [[]] * 6 + words, tags, inflections, lemmas, norms, readings, sub_tokens_list = ( + zip(*dtokens) if dtokens else [[]] * 7 ) sub_tokens_list = list(sub_tokens_list) doc = Doc(self.vocab, words=words, spaces=spaces) @@ -71,9 +76,14 @@ class JapaneseTokenizer(DummyTokenizer): ) # if there's no lemma info (it's an unk) just use the surface token.lemma_ = dtoken.lemma if dtoken.lemma else dtoken.surface - doc.user_data["inflections"] = inflections - doc.user_data["reading_forms"] = readings - doc.user_data["sub_tokens"] = sub_tokens_list + morph = {} + morph["inflection"] = dtoken.inf + token.norm_ = dtoken.norm + if dtoken.reading: + morph["reading"] = dtoken.reading + token.morph = MorphAnalysis(self.vocab, morph) + if self.need_subtokens: + doc.user_data["sub_tokens"] = sub_tokens_list return doc def _get_dtokens(self, sudachipy_tokens, need_sub_tokens: bool = True): @@ -86,7 +96,8 @@ class JapaneseTokenizer(DummyTokenizer): "-".join([xx for xx in token.part_of_speech()[:4] if xx != "*"]), # tag ",".join([xx for xx in token.part_of_speech()[4:] if xx != "*"]), # inf token.dictionary_form(), # lemma - token.reading_form(), # user_data['reading_forms'] + token.normalized_form(), + token.reading_form(), sub_tokens_list[idx] if sub_tokens_list else None, # user_data['sub_tokens'] @@ -108,9 +119,8 @@ class JapaneseTokenizer(DummyTokenizer): ] def _get_sub_tokens(self, sudachipy_tokens): - if ( - self.split_mode is None or self.split_mode == "A" 
- ): # do nothing for default split mode + # do nothing for default split mode + if not self.need_subtokens: return None sub_tokens_list = [] # list of (list of list of DetailedToken | None) @@ -179,9 +189,33 @@ class Japanese(Language): Defaults = JapaneseDefaults +@Japanese.factory( + "morphologizer", + assigns=["token.morph", "token.pos"], + default_config={ + "model": DEFAULT_MORPH_MODEL, + "overwrite": True, + "extend": True, + "scorer": {"@scorers": "spacy.morphologizer_scorer.v1"}, + }, + default_score_weights={"pos_acc": 0.5, "morph_acc": 0.5, "morph_per_feat": None}, +) +def make_morphologizer( + nlp: Language, + model: Model, + name: str, + overwrite: bool, + extend: bool, + scorer: Optional[Callable], +): + return Morphologizer( + nlp.vocab, model, name, overwrite=overwrite, extend=extend, scorer=scorer + ) + + # Hold the attributes we need with convenient names DetailedToken = namedtuple( - "DetailedToken", ["surface", "tag", "inf", "lemma", "reading", "sub_tokens"] + "DetailedToken", ["surface", "tag", "inf", "lemma", "norm", "reading", "sub_tokens"] ) @@ -257,7 +291,7 @@ def get_dtokens_and_spaces(dtokens, text, gap_tag="空白"): return text_dtokens, text_spaces elif len([word for word in words if not word.isspace()]) == 0: assert text.isspace() - text_dtokens = [DetailedToken(text, gap_tag, "", text, None, None)] + text_dtokens = [DetailedToken(text, gap_tag, "", text, text, None, None)] text_spaces = [False] return text_dtokens, text_spaces @@ -274,7 +308,7 @@ def get_dtokens_and_spaces(dtokens, text, gap_tag="空白"): # space token if word_start > 0: w = text[text_pos : text_pos + word_start] - text_dtokens.append(DetailedToken(w, gap_tag, "", w, None, None)) + text_dtokens.append(DetailedToken(w, gap_tag, "", w, w, None, None)) text_spaces.append(False) text_pos += word_start @@ -290,7 +324,7 @@ def get_dtokens_and_spaces(dtokens, text, gap_tag="空白"): # trailing space token if text_pos < len(text): w = text[text_pos:] - text_dtokens.append(DetailedToken(w, gap_tag, "", w, None, None)) + text_dtokens.append(DetailedToken(w, gap_tag, "", w, w, None, None)) text_spaces.append(False) return text_dtokens, text_spaces diff --git a/spacy/tests/lang/ja/test_lemmatization.py b/spacy/tests/lang/ja/test_lemmatization.py index 6041611e6..21879a569 100644 --- a/spacy/tests/lang/ja/test_lemmatization.py +++ b/spacy/tests/lang/ja/test_lemmatization.py @@ -8,3 +8,17 @@ import pytest def test_ja_lemmatizer_assigns(ja_tokenizer, word, lemma): test_lemma = ja_tokenizer(word)[0].lemma_ assert test_lemma == lemma + + +@pytest.mark.parametrize( + "word,norm", + [ + ("SUMMER", "サマー"), + ("食べ物", "食べ物"), + ("綜合", "総合"), + ("コンピュータ", "コンピューター"), + ], +) +def test_ja_lemmatizer_norm(ja_tokenizer, word, norm): + test_norm = ja_tokenizer(word)[0].norm_ + assert test_norm == norm diff --git a/spacy/tests/lang/ja/test_morphologizer_factory.py b/spacy/tests/lang/ja/test_morphologizer_factory.py new file mode 100644 index 000000000..a4e038d01 --- /dev/null +++ b/spacy/tests/lang/ja/test_morphologizer_factory.py @@ -0,0 +1,9 @@ +import pytest +from spacy.lang.ja import Japanese + + +def test_ja_morphologizer_factory(): + pytest.importorskip("sudachipy") + nlp = Japanese() + morphologizer = nlp.add_pipe("morphologizer") + assert morphologizer.cfg["extend"] is True diff --git a/spacy/tests/lang/ja/test_tokenizer.py b/spacy/tests/lang/ja/test_tokenizer.py index c8c85d655..7b322293a 100644 --- a/spacy/tests/lang/ja/test_tokenizer.py +++ b/spacy/tests/lang/ja/test_tokenizer.py @@ -34,22 +34,22 @@ SENTENCE_TESTS = [ 
] tokens1 = [ - DetailedToken(surface="委員", tag="名詞-普通名詞-一般", inf="", lemma="委員", reading="イイン", sub_tokens=None), - DetailedToken(surface="会", tag="名詞-普通名詞-一般", inf="", lemma="会", reading="カイ", sub_tokens=None), + DetailedToken(surface="委員", tag="名詞-普通名詞-一般", inf="", lemma="委員", norm="委員", reading="イイン", sub_tokens=None), + DetailedToken(surface="会", tag="名詞-普通名詞-一般", inf="", lemma="会", norm="会", reading="カイ", sub_tokens=None), ] tokens2 = [ - DetailedToken(surface="選挙", tag="名詞-普通名詞-サ変可能", inf="", lemma="選挙", reading="センキョ", sub_tokens=None), - DetailedToken(surface="管理", tag="名詞-普通名詞-サ変可能", inf="", lemma="管理", reading="カンリ", sub_tokens=None), - DetailedToken(surface="委員", tag="名詞-普通名詞-一般", inf="", lemma="委員", reading="イイン", sub_tokens=None), - DetailedToken(surface="会", tag="名詞-普通名詞-一般", inf="", lemma="会", reading="カイ", sub_tokens=None), + DetailedToken(surface="選挙", tag="名詞-普通名詞-サ変可能", inf="", lemma="選挙", norm="選挙", reading="センキョ", sub_tokens=None), + DetailedToken(surface="管理", tag="名詞-普通名詞-サ変可能", inf="", lemma="管理", norm="管理", reading="カンリ", sub_tokens=None), + DetailedToken(surface="委員", tag="名詞-普通名詞-一般", inf="", lemma="委員", norm="委員", reading="イイン", sub_tokens=None), + DetailedToken(surface="会", tag="名詞-普通名詞-一般", inf="", lemma="会", norm="会", reading="カイ", sub_tokens=None), ] tokens3 = [ - DetailedToken(surface="選挙", tag="名詞-普通名詞-サ変可能", inf="", lemma="選挙", reading="センキョ", sub_tokens=None), - DetailedToken(surface="管理", tag="名詞-普通名詞-サ変可能", inf="", lemma="管理", reading="カンリ", sub_tokens=None), - DetailedToken(surface="委員会", tag="名詞-普通名詞-一般", inf="", lemma="委員会", reading="イインカイ", sub_tokens=None), + DetailedToken(surface="選挙", tag="名詞-普通名詞-サ変可能", inf="", lemma="選挙", norm="選挙", reading="センキョ", sub_tokens=None), + DetailedToken(surface="管理", tag="名詞-普通名詞-サ変可能", inf="", lemma="管理", norm="管理", reading="カンリ", sub_tokens=None), + DetailedToken(surface="委員会", tag="名詞-普通名詞-一般", inf="", lemma="委員会", norm="委員会", reading="イインカイ", sub_tokens=None), ] SUB_TOKEN_TESTS = [ - ("選挙管理委員会", [None, None, None, None], [None, None, [tokens1]], [[tokens2, tokens3]]) + ("選挙管理委員会", [None, None, [tokens1]], [[tokens2, tokens3]]) ] # fmt: on @@ -111,18 +111,16 @@ def test_ja_tokenizer_split_modes(ja_tokenizer, text, len_a, len_b, len_c): assert len(nlp_c(text)) == len_c -@pytest.mark.parametrize( - "text,sub_tokens_list_a,sub_tokens_list_b,sub_tokens_list_c", SUB_TOKEN_TESTS -) +@pytest.mark.parametrize("text,sub_tokens_list_b,sub_tokens_list_c", SUB_TOKEN_TESTS) def test_ja_tokenizer_sub_tokens( - ja_tokenizer, text, sub_tokens_list_a, sub_tokens_list_b, sub_tokens_list_c + ja_tokenizer, text, sub_tokens_list_b, sub_tokens_list_c ): nlp_a = Japanese.from_config({"nlp": {"tokenizer": {"split_mode": "A"}}}) nlp_b = Japanese.from_config({"nlp": {"tokenizer": {"split_mode": "B"}}}) nlp_c = Japanese.from_config({"nlp": {"tokenizer": {"split_mode": "C"}}}) - assert ja_tokenizer(text).user_data["sub_tokens"] == sub_tokens_list_a - assert nlp_a(text).user_data["sub_tokens"] == sub_tokens_list_a + assert ja_tokenizer(text).user_data.get("sub_tokens") is None + assert nlp_a(text).user_data.get("sub_tokens") is None assert nlp_b(text).user_data["sub_tokens"] == sub_tokens_list_b assert nlp_c(text).user_data["sub_tokens"] == sub_tokens_list_c @@ -140,8 +138,11 @@ def test_ja_tokenizer_sub_tokens( def test_ja_tokenizer_inflections_reading_forms( ja_tokenizer, text, inflections, reading_forms ): - assert ja_tokenizer(text).user_data["inflections"] == inflections - assert ja_tokenizer(text).user_data["reading_forms"] == 
reading_forms + tokens = ja_tokenizer(text) + test_inflections = [",".join(tt.morph.get("inflection")) for tt in tokens] + assert test_inflections == list(inflections) + test_readings = [tt.morph.get("reading")[0] for tt in tokens] + assert test_readings == list(reading_forms) def test_ja_tokenizer_emptyish_texts(ja_tokenizer): diff --git a/website/docs/usage/models.md b/website/docs/usage/models.md index d1c9a0a81..3b79c4d0d 100644 --- a/website/docs/usage/models.md +++ b/website/docs/usage/models.md @@ -247,6 +247,10 @@ config can be used to configure the split mode to `A`, `B` or `C`. split_mode = "A" ``` +Extra information, such as reading, inflection form, and the SudachiPy +normalized form, is available in `Token.morph`. For `B` or `C` split modes, +subtokens are stored in `Doc.user_data["sub_tokens"]`. + If you run into errors related to `sudachipy`, which is currently under active From 53b5f245eda6dc3e88b84cba15c2a3f4df3cf4c2 Mon Sep 17 00:00:00 2001 From: "Elia Robyn Lake (Robyn Speer)" Date: Tue, 5 Oct 2021 03:52:22 -0400 Subject: [PATCH 020/133] Allow IETF language codes, aliases, and close matches (#9342) * use language-matching to allow language code aliases Signed-off-by: Elia Robyn Speer * link to "IETF language tags" in docs Signed-off-by: Elia Robyn Speer * Make requirements consistent Signed-off-by: Elia Robyn Speer * change "two-letter language ID" to "IETF language tag" in language docs Signed-off-by: Elia Robyn Speer * use langcodes 3.2 and handle language-tag errors better Signed-off-by: Elia Robyn Speer * all unknown language codes are ImportErrors Signed-off-by: Elia Robyn Speer Co-authored-by: Elia Robyn Speer --- requirements.txt | 1 + setup.cfg | 1 + spacy/errors.py | 2 +- spacy/language.py | 2 +- spacy/tests/test_language.py | 51 +++++++++++++++++++++++- spacy/tests/test_misc.py | 6 +++ spacy/util.py | 75 ++++++++++++++++++++++++++++++++++- website/docs/api/cli.md | 2 +- website/docs/api/language.md | 2 +- website/docs/api/top-level.md | 2 +- 10 files changed, 136 insertions(+), 8 deletions(-) diff --git a/requirements.txt b/requirements.txt index 6f9addbe9..85de453b7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -30,3 +30,4 @@ pytest-timeout>=1.3.0,<2.0.0 mock>=2.0.0,<3.0.0 flake8>=3.8.0,<3.10.0 hypothesis>=3.27.0,<7.0.0 +langcodes>=3.2.0,<4.0.0 diff --git a/setup.cfg b/setup.cfg index da9944a5e..4313612d4 100644 --- a/setup.cfg +++ b/setup.cfg @@ -62,6 +62,7 @@ install_requires = setuptools packaging>=20.0 typing_extensions>=3.7.4,<4.0.0.0; python_version < "3.8" + langcodes>=3.2.0,<4.0.0 [options.entry_points] console_scripts = diff --git a/spacy/errors.py b/spacy/errors.py index 064f33f31..120828fd6 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -282,7 +282,7 @@ class Errors: "you forget to call the `set_extension` method?") E047 = ("Can't assign a value to unregistered extension attribute " "'{name}'. Did you forget to call the `set_extension` method?") - E048 = ("Can't import language {lang} from spacy.lang: {err}") + E048 = ("Can't import language {lang} or any matching language from spacy.lang: {err}") E050 = ("Can't find model '{name}'. It doesn't seem to be a Python " "package or a valid path to a data directory.") E052 = ("Can't find model directory: {path}") diff --git a/spacy/language.py b/spacy/language.py index 512306796..d87f86bd3 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -105,7 +105,7 @@ class Language: Defaults (class): Settings, data and factory methods for creating the `nlp` object and processing pipeline. 
- lang (str): Two-letter language ID, i.e. ISO code. + lang (str): IETF language code, such as 'en'. DOCS: https://spacy.io/api/language """ diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py index e3c25fece..7a9021af0 100644 --- a/spacy/tests/test_language.py +++ b/spacy/tests/test_language.py @@ -8,7 +8,7 @@ from spacy.vocab import Vocab from spacy.training import Example from spacy.lang.en import English from spacy.lang.de import German -from spacy.util import registry, ignore_error, raise_error +from spacy.util import registry, ignore_error, raise_error, find_matching_language import spacy from thinc.api import NumpyOps, get_current_ops @@ -502,6 +502,55 @@ def test_spacy_blank(): assert nlp.meta["name"] == "my_custom_model" +@pytest.mark.parametrize( + "lang,target", + [ + ('en', 'en'), + ('fra', 'fr'), + ('fre', 'fr'), + ('iw', 'he'), + ('mo', 'ro'), + ('mul', 'xx'), + ('no', 'nb'), + ('pt-BR', 'pt'), + ('xx', 'xx'), + ('zh-Hans', 'zh'), + ('zh-Hant', None), + ('zxx', None) + ] +) +def test_language_matching(lang, target): + """ + Test that we can look up languages by equivalent or nearly-equivalent + language codes. + """ + assert find_matching_language(lang) == target + + +@pytest.mark.parametrize( + "lang,target", + [ + ('en', 'en'), + ('fra', 'fr'), + ('fre', 'fr'), + ('iw', 'he'), + ('mo', 'ro'), + ('mul', 'xx'), + ('no', 'nb'), + ('pt-BR', 'pt'), + ('xx', 'xx'), + ('zh-Hans', 'zh'), + ] +) +def test_blank_languages(lang, target): + """ + Test that we can get spacy.blank in various languages, including codes + that are defined to be equivalent or that match by CLDR language matching. + """ + nlp = spacy.blank(lang) + assert nlp.lang == target + + @pytest.mark.parametrize("value", [False, None, ["x", "y"], Language, Vocab]) def test_language_init_invalid_vocab(value): err_fragment = "invalid value" diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py index 45cbdf45b..f17d5e62e 100644 --- a/spacy/tests/test_misc.py +++ b/spacy/tests/test_misc.py @@ -139,6 +139,12 @@ def test_load_model_blank_shortcut(): nlp = util.load_model("blank:en") assert nlp.lang == "en" assert nlp.pipeline == [] + + # ImportError for loading an unsupported language + with pytest.raises(ImportError): + util.load_model("blank:zxx") + + # ImportError for requesting an invalid language code that isn't registered with pytest.raises(ImportError): util.load_model("blank:fjsfijsdof") diff --git a/spacy/util.py b/spacy/util.py index e747d5fbc..fc1c0e76d 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -16,6 +16,7 @@ import numpy import srsly import catalogue from catalogue import RegistryError, Registry +import langcodes import sys import warnings from packaging.specifiers import SpecifierSet, InvalidSpecifier @@ -28,6 +29,7 @@ import tempfile import shutil import shlex import inspect +import pkgutil import logging try: @@ -256,20 +258,89 @@ def lang_class_is_loaded(lang: str) -> bool: return lang in registry.languages +def find_matching_language(lang: str) -> Optional[str]: + """ + Given an IETF language code, find a supported spaCy language that is a + close match for it (according to Unicode CLDR language-matching rules). + This allows for language aliases, ISO 639-2 codes, more detailed language + tags, and close matches. + + Returns the language code if a matching language is available, or None + if there is no matching language. 
+ + >>> find_matching_language('en') + 'en' + >>> find_matching_language('pt-BR') # Brazilian Portuguese + 'pt' + >>> find_matching_language('fra') # an ISO 639-2 code for French + 'fr' + >>> find_matching_language('iw') # obsolete alias for Hebrew + 'he' + >>> find_matching_language('no') # Norwegian + 'nb' + >>> find_matching_language('mo') # old code for ro-MD + 'ro' + >>> find_matching_language('zh-Hans') # Simplified Chinese + 'zh' + >>> find_matching_language('zxx') + None + """ + import spacy.lang # noqa: F401 + if lang == 'xx': + return 'xx' + + # Find out which language modules we have + possible_languages = [] + for modinfo in pkgutil.iter_modules(spacy.lang.__path__): + code = modinfo.name + if code == 'xx': + # Temporarily make 'xx' into a valid language code + possible_languages.append('mul') + elif langcodes.tag_is_valid(code): + possible_languages.append(code) + + # Distances from 1-9 allow near misses like Bosnian -> Croatian and + # Norwegian -> Norwegian Bokmål. A distance of 10 would include several + # more possibilities, like variants of Chinese like 'wuu', but text that + # is labeled that way is probably trying to be distinct from 'zh' and + # shouldn't automatically match. + match = langcodes.closest_supported_match( + lang, possible_languages, max_distance=9 + ) + if match == 'mul': + # Convert 'mul' back to spaCy's 'xx' + return 'xx' + else: + return match + + def get_lang_class(lang: str) -> "Language": """Import and load a Language class. - lang (str): Two-letter language code, e.g. 'en'. + lang (str): IETF language code, such as 'en'. RETURNS (Language): Language class. """ # Check if language is registered / entry point is available if lang in registry.languages: return registry.languages.get(lang) else: + # Find the language in the spacy.lang subpackage try: module = importlib.import_module(f".lang.{lang}", "spacy") except ImportError as err: - raise ImportError(Errors.E048.format(lang=lang, err=err)) from err + # Find a matching language. For example, if the language 'no' is + # requested, we can use language-matching to load `spacy.lang.nb`. + try: + match = find_matching_language(lang) + except langcodes.tag_parser.LanguageTagError: + # proceed to raising an import error + match = None + + if match: + lang = match + module = importlib.import_module(f".lang.{lang}", "spacy") + else: + raise ImportError(Errors.E048.format(lang=lang, err=err)) from err set_lang_class(lang, getattr(module, module.__all__[0])) return registry.languages.get(lang) diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index 10ab2083e..aadeebd77 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -203,7 +203,7 @@ $ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--tr | Name | Description | | ------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `lang` | Pipeline language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes), e.g. `en`. ~~str (positional)~~ | +| `lang` | Pipeline language [IETF language tag](https://www.w3.org/International/articles/language-tags/), such as `en`. ~~str (positional)~~ | | `vectors_loc` | Location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. 
File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. ~~Path (positional)~~ | | `output_dir` | Pipeline output directory. Will be created if it doesn't exist. ~~Path (positional)~~ | | `--truncate`, `-t` | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. ~~int (option)~~ | diff --git a/website/docs/api/language.md b/website/docs/api/language.md index 0aa33b281..4cf063fcc 100644 --- a/website/docs/api/language.md +++ b/website/docs/api/language.md @@ -1039,7 +1039,7 @@ available to the loaded object. | Name | Description | | ---------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `Defaults` | Settings, data and factory methods for creating the `nlp` object and processing pipeline. ~~Defaults~~ | -| `lang` | Two-letter language ID, i.e. [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes). ~~str~~ | +| `lang` | [IETF language tag](https://www.w3.org/International/articles/language-tags/), such as 'en' for English. ~~str~~ | | `default_config` | Base [config](/usage/training#config) to use for [Language.config](/api/language#config). Defaults to [`default_config.cfg`](%%GITHUB_SPACY/spacy/default_config.cfg). ~~Config~~ | ## Defaults {#defaults} diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index 48c16e559..b48cd47f3 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -83,7 +83,7 @@ Create a blank pipeline of a given language class. This function is the twin of | Name | Description | | ----------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `name` | [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) of the language class to load. ~~str~~ | +| `name` | [IETF language tag](https://www.w3.org/International/articles/language-tags/), such as 'en', of the language class to load. ~~str~~ | | _keyword-only_ | | | `vocab` | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~ | | `config` 3 | Optional config overrides, either as nested dict or dict keyed by section value in dot notation, e.g. `"components.name.value"`. 
~~Union[Dict[str, Any], Config]~~ | From f975690cc9f0176db2a6210b7ff64c16c44771b8 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Thu, 7 Oct 2021 17:09:38 +0900 Subject: [PATCH 021/133] Use hyphen to join parts of inflection in JA tokenizer --- spacy/lang/ja/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py index 04ff152cf..0f25b1fc1 100644 --- a/spacy/lang/ja/__init__.py +++ b/spacy/lang/ja/__init__.py @@ -94,7 +94,7 @@ class JapaneseTokenizer(DummyTokenizer): DetailedToken( token.surface(), # orth "-".join([xx for xx in token.part_of_speech()[:4] if xx != "*"]), # tag - ",".join([xx for xx in token.part_of_speech()[4:] if xx != "*"]), # inf + "-".join([xx for xx in token.part_of_speech()[4:] if xx != "*"]), # inf token.dictionary_form(), # lemma token.normalized_form(), token.reading_form(), From 227f98081b86b315907ec672f02b0a2334dd10e8 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Thu, 7 Oct 2021 17:14:05 +0900 Subject: [PATCH 022/133] Use a pipe for separating Japanese inflections MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Inflection values look like this pipe separated: 五段-ラ行|連用形-促音便 So using a hyphen erases the original fields. --- spacy/lang/ja/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py index 0f25b1fc1..0695415be 100644 --- a/spacy/lang/ja/__init__.py +++ b/spacy/lang/ja/__init__.py @@ -94,7 +94,7 @@ class JapaneseTokenizer(DummyTokenizer): DetailedToken( token.surface(), # orth "-".join([xx for xx in token.part_of_speech()[:4] if xx != "*"]), # tag - "-".join([xx for xx in token.part_of_speech()[4:] if xx != "*"]), # inf + "|".join([xx for xx in token.part_of_speech()[4:] if xx != "*"]), # inf token.dictionary_form(), # lemma token.normalized_form(), token.reading_form(), From c4e3b7a5db7de8336e1e9edb424a8a6eb23940e9 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Thu, 7 Oct 2021 17:28:15 +0900 Subject: [PATCH 023/133] Change JA inflection separator to semicolon Hyphen is unsuitable because of interactions with the JA data fields, but pipe is also unsuitable because it has a different meaning in UD data, so it's better to use something that has no significance in either case. So this uses semicolon. 
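As a minimal usage sketch (an editor's illustration, not part of this patch, and assuming `sudachipy` is installed so the Japanese tokenizer can run), the semicolon-joined value can be split back into the original inflection fields:

```python
# Sketch only: shows how the individual inflection fields can be recovered
# after this change; requires sudachipy for the Japanese tokenizer.
import spacy

nlp = spacy.blank("ja")
doc = nlp("取ってつけた")
inflection = doc[0].morph.get("inflection")[0]  # "五段-ラ行;連用形-促音便"
fields = inflection.split(";")                  # ["五段-ラ行", "連用形-促音便"]
print(fields)
```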
--- spacy/lang/ja/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py index 0695415be..972e21a3a 100644 --- a/spacy/lang/ja/__init__.py +++ b/spacy/lang/ja/__init__.py @@ -94,7 +94,7 @@ class JapaneseTokenizer(DummyTokenizer): DetailedToken( token.surface(), # orth "-".join([xx for xx in token.part_of_speech()[:4] if xx != "*"]), # tag - "|".join([xx for xx in token.part_of_speech()[4:] if xx != "*"]), # inf + ";".join([xx for xx in token.part_of_speech()[4:] if xx != "*"]), # inf token.dictionary_form(), # lemma token.normalized_form(), token.reading_form(), From 113d53ab6c7ec902a002d8508a28c8aba3faf7a0 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Thu, 7 Oct 2021 11:42:18 +0000 Subject: [PATCH 024/133] Fix tests for changes to inflection structure (#9390) --- spacy/tests/lang/ja/test_tokenizer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/tests/lang/ja/test_tokenizer.py b/spacy/tests/lang/ja/test_tokenizer.py index 7b322293a..e61975551 100644 --- a/spacy/tests/lang/ja/test_tokenizer.py +++ b/spacy/tests/lang/ja/test_tokenizer.py @@ -130,7 +130,7 @@ def test_ja_tokenizer_sub_tokens( [ ( "取ってつけた", - ("五段-ラ行,連用形-促音便", "", "下一段-カ行,連用形-一般", "助動詞-タ,終止形-一般"), + ("五段-ラ行;連用形-促音便", "", "下一段-カ行;連用形-一般", "助動詞-タ;終止形-一般"), ("トッ", "テ", "ツケ", "タ"), ), ], @@ -139,7 +139,7 @@ def test_ja_tokenizer_inflections_reading_forms( ja_tokenizer, text, inflections, reading_forms ): tokens = ja_tokenizer(text) - test_inflections = [",".join(tt.morph.get("inflection")) for tt in tokens] + test_inflections = [tt.morph.get("inflection")[0] for tt in tokens] assert test_inflections == list(inflections) test_readings = [tt.morph.get("reading")[0] for tt in tokens] assert test_readings == list(reading_forms) From ae1b3e960bd20a2c97ba3d61967d4b4ab85caec7 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 11 Oct 2021 10:35:07 +0200 Subject: [PATCH 025/133] Update overwrite and scorer in API docs (#9384) * Update overwrite and scorer in API docs * Rephrase morphologizer extend + example --- website/docs/api/entitylinker.md | 47 ++++++++++++++------------ website/docs/api/morphologizer.md | 38 +++++++++++++++------ website/docs/api/sentencerecognizer.md | 23 +++++++------ website/docs/api/sentencizer.md | 19 ++++++----- website/docs/api/tagger.md | 23 +++++++------ 5 files changed, 90 insertions(+), 60 deletions(-) diff --git a/website/docs/api/entitylinker.md b/website/docs/api/entitylinker.md index 3c83ba484..3d3372679 100644 --- a/website/docs/api/entitylinker.md +++ b/website/docs/api/entitylinker.md @@ -51,16 +51,17 @@ architectures and their arguments and hyperparameters. > nlp.add_pipe("entity_linker", config=config) > ``` -| Setting | Description | -| ---------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `labels_discard` | NER labels that will automatically get a "NIL" prediction. Defaults to `[]`. ~~Iterable[str]~~ | -| `n_sents` | The number of neighbouring sentences to take into account. Defaults to 0. ~~int~~ | -| `incl_prior` | Whether or not to include prior probabilities from the KB in the model. Defaults to `True`. ~~bool~~ | -| `incl_context` | Whether or not to include the local context in the model. Defaults to `True`. 
~~bool~~ | -| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~ | -| `entity_vector_length` | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~ | -| `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ | -| `scorer` | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ | +| Setting | Description | +| ---------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `labels_discard` | NER labels that will automatically get a "NIL" prediction. Defaults to `[]`. ~~Iterable[str]~~ | +| `n_sents` | The number of neighbouring sentences to take into account. Defaults to 0. ~~int~~ | +| `incl_prior` | Whether or not to include prior probabilities from the KB in the model. Defaults to `True`. ~~bool~~ | +| `incl_context` | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~ | +| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~ | +| `entity_vector_length` | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~ | +| `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ | +| `overwrite` 3.2 | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ | +| `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ | ```python %%GITHUB_SPACY/spacy/pipeline/entity_linker.py @@ -93,18 +94,20 @@ custom knowledge base, you should either call [`set_kb`](/api/entitylinker#set_kb) or provide a `kb_loader` in the [`initialize`](/api/entitylinker#initialize) call. -| Name | Description | -| ---------------------- | -------------------------------------------------------------------------------------------------------------------------------- | -| `vocab` | The shared vocabulary. ~~Vocab~~ | -| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model~~ | -| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | -| _keyword-only_ | | -| `entity_vector_length` | Size of encoding vectors in the KB. ~~int~~ | -| `get_candidates` | Function that generates plausible candidates for a given `Span` object. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ | -| `labels_discard` | NER labels that will automatically get a `"NIL"` prediction. ~~Iterable[str]~~ | -| `n_sents` | The number of neighbouring sentences to take into account. ~~int~~ | -| `incl_prior` | Whether or not to include prior probabilities from the KB in the model. ~~bool~~ | -| `incl_context` | Whether or not to include the local context in the model. 
~~bool~~ | +| Name | Description | +| ---------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | The shared vocabulary. ~~Vocab~~ | +| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model~~ | +| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | +| _keyword-only_ | | +| `entity_vector_length` | Size of encoding vectors in the KB. ~~int~~ | +| `get_candidates` | Function that generates plausible candidates for a given `Span` object. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ | +| `labels_discard` | NER labels that will automatically get a `"NIL"` prediction. ~~Iterable[str]~~ | +| `n_sents` | The number of neighbouring sentences to take into account. ~~int~~ | +| `incl_prior` | Whether or not to include prior probabilities from the KB in the model. ~~bool~~ | +| `incl_context` | Whether or not to include the local context in the model. ~~bool~~ | +| `overwrite` 3.2 | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ | +| `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ | ## EntityLinker.\_\_call\_\_ {#call tag="method"} diff --git a/website/docs/api/morphologizer.md b/website/docs/api/morphologizer.md index 054d48a2c..434c56833 100644 --- a/website/docs/api/morphologizer.md +++ b/website/docs/api/morphologizer.md @@ -42,9 +42,12 @@ architectures and their arguments and hyperparameters. > nlp.add_pipe("morphologizer", config=config) > ``` -| Setting | Description | -| ------- | ------------------------------------------------------------------------------------------------------- | -| `model` | The model to use. Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ | +| Setting | Description | +| ---------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `model` | The model to use. Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ | +| `overwrite` 3.2 | Whether the values of existing features are overwritten. Defaults to `True`. ~~bool~~ | +| `extend` 3.2 | Whether existing feature types (whose values may or may not be overwritten depending on `overwrite`) are preserved. Defaults to `False`. ~~bool~~ | +| `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attributes `"pos"` and `"morph"` and [`Scorer.score_token_attr_per_feat`](/api/scorer#score_token_attr_per_feat) for the attribute `"morph"`. ~~Optional[Callable]~~ | ```python %%GITHUB_SPACY/spacy/pipeline/morphologizer.pyx @@ -56,6 +59,19 @@ Create a new pipeline instance. In your application, you would normally use a shortcut for this and instantiate the component using its string name and [`nlp.add_pipe`](/api/language#add_pipe). 
+The `overwrite` and `extend` settings determine how existing annotation is +handled (with the example for existing annotation `A=B|C=D` + predicted +annotation `C=E|X=Y`): + +- `overwrite=True, extend=True`: overwrite values of existing features, add any + new features (`A=B|C=D` + `C=E|X=Y` → `A=B|C=E|X=Y`) +- `overwrite=True, extend=False`: overwrite completely, removing any existing + features (`A=B|C=D` + `C=E|X=Y` → `C=E|X=Y`) +- `overwrite=False, extend=True`: keep values of existing features, add any new + features (`A=B|C=D` + `C=E|X=Y` → `A=B|C=D|X=Y`) +- `overwrite=False, extend=False`: do not modify the existing annotation if set + (`A=B|C=D` + `C=E|X=Y` → `A=B|C=D`) + > #### Example > > ```python @@ -71,13 +87,15 @@ shortcut for this and instantiate the component using its string name and > morphologizer = Morphologizer(nlp.vocab, model) > ``` -| Name | Description | -| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `vocab` | The shared vocabulary. ~~Vocab~~ | -| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ | -| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | -| _keyword-only_ | | -| `scorer` | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attributes `"pos"` and `"morph"` and [`Scorer.score_token_attr_per_feat`](/api/scorer#score_token_attr_per_feat) for the attribute `"morph"`. ~~Optional[Callable]~~ | +| Name | Description | +| ---------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | The shared vocabulary. ~~Vocab~~ | +| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ | +| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | +| _keyword-only_ | | +| `overwrite` 3.2 | Whether the values of existing features are overwritten. Defaults to `True`. ~~bool~~ | +| `extend` 3.2 | Whether existing feature types (whose values may or may not be overwritten depending on `overwrite`) are preserved. Defaults to `False`. ~~bool~~ | +| `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attributes `"pos"` and `"morph"` and [`Scorer.score_token_attr_per_feat`](/api/scorer#score_token_attr_per_feat) for the attribute `"morph"`. ~~Optional[Callable]~~ | ## Morphologizer.\_\_call\_\_ {#call tag="method"} diff --git a/website/docs/api/sentencerecognizer.md b/website/docs/api/sentencerecognizer.md index 7b6ef7d34..29bf10393 100644 --- a/website/docs/api/sentencerecognizer.md +++ b/website/docs/api/sentencerecognizer.md @@ -39,9 +39,11 @@ architectures and their arguments and hyperparameters. 
> nlp.add_pipe("senter", config=config) > ``` -| Setting | Description | -| ------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ | +| Setting | Description | +| ---------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ | +| `overwrite` 3.2 | Whether existing annotation is overwritten. Defaults to `False`. ~~bool~~ | +| `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for the attribute `"sents"`. ~~Optional[Callable]~~ | ```python %%GITHUB_SPACY/spacy/pipeline/senter.pyx @@ -70,13 +72,14 @@ Create a new pipeline instance. In your application, you would normally use a shortcut for this and instantiate the component using its string name and [`nlp.add_pipe`](/api/language#add_pipe). -| Name | Description | -| -------------- | ----------------------------------------------------------------------------------------------------------------------------------- | -| `vocab` | The shared vocabulary. ~~Vocab~~ | -| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ | -| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | -| _keyword-only_ | | -| `scorer` | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for the attribute `"sents"`. ~~Optional[Callable]~~ | +| Name | Description | +| ---------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | The shared vocabulary. ~~Vocab~~ | +| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ | +| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | +| _keyword-only_ | | +| `overwrite` 3.2 | Whether existing annotation is overwritten. Defaults to `False`. ~~bool~~ | +| `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for the attribute `"sents"`. ~~Optional[Callable]~~ | ## SentenceRecognizer.\_\_call\_\_ {#call tag="method"} diff --git a/website/docs/api/sentencizer.md b/website/docs/api/sentencizer.md index 314ba62ad..b75c7a2f1 100644 --- a/website/docs/api/sentencizer.md +++ b/website/docs/api/sentencizer.md @@ -37,9 +37,11 @@ how the component should be configured. You can override its settings via the > nlp.add_pipe("sentencizer", config=config) > ``` -| Setting | Description | -| ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `punct_chars` | Optional custom list of punctuation characters that mark sentence ends. See below for defaults if not set. Defaults to `None`. 
~~Optional[List[str]]~~ | `None` | +| Setting | Description | +| ---------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `punct_chars` | Optional custom list of punctuation characters that mark sentence ends. See below for defaults if not set. Defaults to `None`. ~~Optional[List[str]]~~ | `None` | +| `overwrite` 3.2 | Whether existing annotation is overwritten. Defaults to `False`. ~~bool~~ | +| `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for the attribute `"sents"` ~~Optional[Callable]~~ | ```python %%GITHUB_SPACY/spacy/pipeline/sentencizer.pyx @@ -60,11 +62,12 @@ Initialize the sentencizer. > sentencizer = Sentencizer() > ``` -| Name | Description | -| -------------- | ---------------------------------------------------------------------------------------------------------------------------------- | -| _keyword-only_ | | -| `punct_chars` | Optional custom list of punctuation characters that mark sentence ends. See below for defaults. ~~Optional[List[str]]~~ | -| `scorer` | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for the attribute `"sents"` ~~Optional[Callable]~~ | +| Name | Description | +| ---------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------- | +| _keyword-only_ | | +| `punct_chars` | Optional custom list of punctuation characters that mark sentence ends. See below for defaults. ~~Optional[List[str]]~~ | +| `overwrite` 3.2 | Whether existing annotation is overwritten. Defaults to `False`. ~~bool~~ | +| `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for the attribute `"sents"` ~~Optional[Callable]~~ | ```python ### punct_chars defaults diff --git a/website/docs/api/tagger.md b/website/docs/api/tagger.md index 50d444658..93b6bc88b 100644 --- a/website/docs/api/tagger.md +++ b/website/docs/api/tagger.md @@ -40,9 +40,11 @@ architectures and their arguments and hyperparameters. > nlp.add_pipe("tagger", config=config) > ``` -| Setting | Description | -| ------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `model` | A model instance that predicts the tag probabilities. The output vectors should match the number of tags in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ | +| Setting | Description | +| ---------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `model` | A model instance that predicts the tag probabilities. The output vectors should match the number of tags in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). Defaults to [Tagger](/api/architectures#Tagger). 
~~Model[List[Doc], List[Floats2d]]~~ | +| `overwrite` 3.2 | Whether existing annotation is overwritten. Defaults to `False`. ~~bool~~ | +| `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attribute `"tag"`. ~~Optional[Callable]~~ | ```python %%GITHUB_SPACY/spacy/pipeline/tagger.pyx @@ -69,13 +71,14 @@ Create a new pipeline instance. In your application, you would normally use a shortcut for this and instantiate the component using its string name and [`nlp.add_pipe`](/api/language#add_pipe). -| Name | Description | -| -------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `vocab` | The shared vocabulary. ~~Vocab~~ | -| `model` | A model instance that predicts the tag probabilities. The output vectors should match the number of tags in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). ~~Model[List[Doc], List[Floats2d]]~~ | -| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | -| _keyword-only_ | | -| `scorer` | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attribute `"tag"`. ~~Optional[Callable]~~ | +| Name | Description | +| ---------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | The shared vocabulary. ~~Vocab~~ | +| `model` | A model instance that predicts the tag probabilities. The output vectors should match the number of tags in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). ~~Model[List[Doc], List[Floats2d]]~~ | +| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | +| _keyword-only_ | | +| `overwrite` 3.2 | Whether existing annotation is overwritten. Defaults to `False`. ~~bool~~ | +| `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attribute `"tag"`. 
~~Optional[Callable]~~ | ## Tagger.\_\_call\_\_ {#call tag="method"} From a3b7519aba438df736a269e44adeaef1d165d7ee Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Thu, 14 Oct 2021 07:21:36 +0000 Subject: [PATCH 026/133] Fix JA Morph Values (#9449) * Don't set empty / weird values in morph * Update tests to handy empty morph values * Fix everything * Replace potentially problematic characters * Fix test --- spacy/lang/ja/__init__.py | 9 +++++++-- spacy/tests/lang/ja/test_tokenizer.py | 13 +++++++++---- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py index 972e21a3a..e701ecfdf 100644 --- a/spacy/lang/ja/__init__.py +++ b/spacy/lang/ja/__init__.py @@ -3,6 +3,7 @@ from pathlib import Path import srsly from collections import namedtuple from thinc.api import Model +import re from .stop_words import STOP_WORDS from .syntax_iterators import SYNTAX_ITERATORS @@ -77,10 +78,14 @@ class JapaneseTokenizer(DummyTokenizer): # if there's no lemma info (it's an unk) just use the surface token.lemma_ = dtoken.lemma if dtoken.lemma else dtoken.surface morph = {} - morph["inflection"] = dtoken.inf + if dtoken.inf: + # it's normal for this to be empty for non-inflecting types + morph["inflection"] = dtoken.inf token.norm_ = dtoken.norm if dtoken.reading: - morph["reading"] = dtoken.reading + # punctuation is its own reading, but we don't want values like + # "=" here + morph["reading"] = re.sub("[=|]", "_", dtoken.reading) token.morph = MorphAnalysis(self.vocab, morph) if self.need_subtokens: doc.user_data["sub_tokens"] = sub_tokens_list diff --git a/spacy/tests/lang/ja/test_tokenizer.py b/spacy/tests/lang/ja/test_tokenizer.py index e61975551..eb170061a 100644 --- a/spacy/tests/lang/ja/test_tokenizer.py +++ b/spacy/tests/lang/ja/test_tokenizer.py @@ -130,8 +130,13 @@ def test_ja_tokenizer_sub_tokens( [ ( "取ってつけた", - ("五段-ラ行;連用形-促音便", "", "下一段-カ行;連用形-一般", "助動詞-タ;終止形-一般"), - ("トッ", "テ", "ツケ", "タ"), + (["五段-ラ行;連用形-促音便"], [], ["下一段-カ行;連用形-一般"], ["助動詞-タ;終止形-一般"]), + (["トッ"], ["テ"], ["ツケ"], ["タ"]), + ), + ( + "2=3", + ([], [], []), + (["ニ"], ["_"], ["サン"]) ), ], ) @@ -139,9 +144,9 @@ def test_ja_tokenizer_inflections_reading_forms( ja_tokenizer, text, inflections, reading_forms ): tokens = ja_tokenizer(text) - test_inflections = [tt.morph.get("inflection")[0] for tt in tokens] + test_inflections = [tt.morph.get("inflection") for tt in tokens] assert test_inflections == list(inflections) - test_readings = [tt.morph.get("reading")[0] for tt in tokens] + test_readings = [tt.morph.get("reading") for tt in tokens] assert test_readings == list(reading_forms) From 8a018f5207d81e15701798243c65194fd1f80aa6 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 14 Oct 2021 10:18:11 +0200 Subject: [PATCH 027/133] Set version to v3.2.0.dev0 --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index 3137be806..f6043353e 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy" -__version__ = "3.1.3" +__version__ = "3.2.0.dev0" __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" __projects__ = "https://github.com/explosion/projects" From bd6433bbab66de70086e6df9130aab25cca4fe7e Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 14 Oct 2021 10:30:57 +0200 Subject: [PATCH 028/133] Temporarily use v3.1.0 models in CI 
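After the bump to v3.2.0.dev0, `spacy download` presumably has no compatible `ca_core_news` releases to resolve yet, so the CI step below temporarily pins the v3.1.0 wheels with `pip install --no-deps` and keeps the same load smoke test, which amounts to roughly this sketch:

```python
import spacy

# assumes ca_core_news_sm 3.1.0 was installed from its wheel URL beforehand
nlp = spacy.load("ca_core_news_sm")
doc = nlp("test")
print(doc[0].text)
```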
--- .github/azure-steps.yml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/azure-steps.yml b/.github/azure-steps.yml index 543804b9f..c47e9799e 100644 --- a/.github/azure-steps.yml +++ b/.github/azure-steps.yml @@ -61,8 +61,11 @@ steps: condition: eq(${{ parameters.gpu }}, true) - script: | - python -m spacy download ca_core_news_sm - python -m spacy download ca_core_news_md + #python -m spacy download ca_core_news_sm + #python -m spacy download ca_core_news_md + # temporarily install the v3.1.0 models + pip install --no-deps https://github.com/explosion/spacy-models/releases/download/ca_core_news_sm-3.1.0/ca_core_news_sm-3.1.0-py3-none-any.whl + pip install --no-deps https://github.com/explosion/spacy-models/releases/download/ca_core_news_md-3.1.0/ca_core_news_md-3.1.0-py3-none-any.whl python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')" displayName: 'Test download CLI' condition: eq(variables['python_version'], '3.8') From 8db574e0b55d00196be50eebc56775a2854c8795 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 14 Oct 2021 13:27:39 +0200 Subject: [PATCH 029/133] Temporarily ignore W095 in assemble CLI CI test (#9460) * Temporarily ignore W095 in assemble CLI CI test * Adjust PR CI includes --- .github/azure-steps.yml | 3 ++- azure-pipelines.yml | 9 ++++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/.github/azure-steps.yml b/.github/azure-steps.yml index c47e9799e..823509888 100644 --- a/.github/azure-steps.yml +++ b/.github/azure-steps.yml @@ -94,7 +94,8 @@ steps: - script: | python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')" - PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir + # temporarily ignore W095 + PYTHONWARNINGS="error,ignore:[W095]:UserWarning,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir displayName: 'Test assemble CLI' condition: eq(variables['python_version'], '3.8') diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 844946845..245407189 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -13,10 +13,13 @@ trigger: - "*.md" pr: paths: - exclude: - - "website/*" - - "*.md" include: + - "*.cfg" + - "*.py" + - "*.toml" + - "*.yml" + - ".github/azure-steps.yml" + - "spacy/*" - "website/meta/universe.json" jobs: From 9a824255d3423e6eab8998e9cea54ddc1fd8ac67 Mon Sep 17 00:00:00 2001 From: Aviora Date: Fri, 15 Oct 2021 00:15:51 +0700 Subject: [PATCH 030/133] Add examples and num_words for Vietnamese (#9412) * add examples and num_words * add contributor agreement * Update spacy/lang/vi/examples.py Co-authored-by: Sofie Van Landeghem * consistent format add empty line at the end of file Co-authored-by: Sofie Van Landeghem --- .github/contributors/avi197.md | 106 +++++++++++++++++++++++++++++++++ spacy/lang/vi/examples.py | 18 ++++++ spacy/lang/vi/lex_attrs.py | 3 + 3 files changed, 127 insertions(+) create mode 100644 .github/contributors/avi197.md create mode 100644 spacy/lang/vi/examples.py diff --git a/.github/contributors/avi197.md b/.github/contributors/avi197.md new file mode 100644 index 000000000..903d7db4c --- /dev/null +++ b/.github/contributors/avi197.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). 
+The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. 
You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Son Pham | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 09/10/2021 | +| GitHub username | Avi197 | +| Website (optional) | | diff --git a/spacy/lang/vi/examples.py b/spacy/lang/vi/examples.py new file mode 100644 index 000000000..86d0b50b8 --- /dev/null +++ b/spacy/lang/vi/examples.py @@ -0,0 +1,18 @@ + +""" +Example sentences to test spaCy and its language models. 
+>>> from spacy.lang.vi.examples import sentences +>>> docs = nlp.pipe(sentences) +""" + + +sentences = [ + "Đây là đâu, tôi là ai?", + "Căn phòng có nhiều cửa sổ nên nó khá sáng", + "Đại dịch COVID vừa qua đã gây ảnh hưởng rất lớn tới nhiều doanh nghiệp lớn nhỏ.", + "Thành phố Hồ Chí Minh đã bị ảnh hưởng nặng nề trong thời gian vừa qua.", + "Ông bạn đang ở đâu thế?", + "Ai là người giải phóng đất nước Việt Nam khỏi ách đô hộ?", + "Vị tướng nào là người đã làm nên chiến thắng lịch sử Điện Biên Phủ?", + "Làm việc nhiều chán quá, đi chơi đâu đi?", +] diff --git a/spacy/lang/vi/lex_attrs.py b/spacy/lang/vi/lex_attrs.py index b3dbf2192..33a3745cc 100644 --- a/spacy/lang/vi/lex_attrs.py +++ b/spacy/lang/vi/lex_attrs.py @@ -9,11 +9,14 @@ _num_words = [ "bốn", "năm", "sáu", + "bảy", "bẩy", "tám", "chín", "mười", + "chục", "trăm", + "nghìn", "tỷ", ] From 271e8e785648f6e5a4c07b35d4f239c259bfe9e0 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 15 Oct 2021 14:28:02 +0200 Subject: [PATCH 031/133] Skip compat table tests for prerelease versions (#9476) --- spacy/tests/test_cli.py | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index 72bbe04e5..3243d426b 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -1,5 +1,6 @@ import pytest from click import NoSuchOption +from packaging.specifiers import SpecifierSet from spacy.training import docs_to_json, offsets_to_biluo_tags from spacy.training.converters import iob_to_docs, conll_ner_to_docs, conllu_to_docs from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate @@ -492,18 +493,24 @@ def test_string_to_list_intify(value): def test_download_compatibility(): - model_name = "en_core_web_sm" - compatibility = get_compatibility() - version = get_version(model_name, compatibility) - assert get_minor_version(about.__version__) == get_minor_version(version) + spec = SpecifierSet("==" + about.__version__) + spec.prereleases = False + if about.__version__ in spec: + model_name = "en_core_web_sm" + compatibility = get_compatibility() + version = get_version(model_name, compatibility) + assert get_minor_version(about.__version__) == get_minor_version(version) def test_validate_compatibility_table(): - model_pkgs, compat = get_model_pkgs() - spacy_version = get_minor_version(about.__version__) - current_compat = compat.get(spacy_version, {}) - assert len(current_compat) > 0 - assert "en_core_web_sm" in current_compat + spec = SpecifierSet("==" + about.__version__) + spec.prereleases = False + if about.__version__ in spec: + model_pkgs, compat = get_model_pkgs() + spacy_version = get_minor_version(about.__version__) + current_compat = compat.get(spacy_version, {}) + assert len(current_compat) > 0 + assert "en_core_web_sm" in current_compat @pytest.mark.parametrize("component_name", ["ner", "textcat", "spancat", "tagger"]) From 3f181b73d027be4fc731313b846da73700367360 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 20 Oct 2021 10:18:02 +0200 Subject: [PATCH 032/133] Add ja_core_news_trf to website (#9515) --- website/meta/languages.json | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/website/meta/languages.json b/website/meta/languages.json index 2ba117d53..a7dda6482 100644 --- a/website/meta/languages.json +++ b/website/meta/languages.json @@ -192,17 +192,10 @@ "models": [ "ja_core_news_sm", "ja_core_news_md", - "ja_core_news_lg" + "ja_core_news_lg", + "ja_core_news_trf" ], "dependencies": [ - { 
- "name": "Unidic", - "url": "http://unidic.ninjal.ac.jp/back_number#unidic_cwj" - }, - { - "name": "Mecab", - "url": "https://github.com/taku910/mecab" - }, { "name": "SudachiPy", "url": "https://github.com/WorksApplications/SudachiPy" From 2ea9b58006f50b1e5b8bb8ebd23b90d29bbb681f Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 27 Oct 2021 13:02:25 +0200 Subject: [PATCH 033/133] Ignore prefix in suffix matches (#9155) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Ignore prefix in suffix matches Ignore the currently matched prefix when looking for suffix matches in the tokenizer. Otherwise a lookbehind in the suffix pattern may match incorrectly due the presence of the prefix in the token string. * Move °[cfkCFK]. to a tokenizer exception * Adjust exceptions for same tokenization as v3.1 * Also update test accordingly * Continue to split . after °CFK if ° is not a prefix * Exclude new ° exceptions for pl * Switch back to default tokenization of "° C ." * Revert "Exclude new ° exceptions for pl" This reverts commit 952013a5b4114ca0ed3b65285f50e8ef05c1695a. * Add exceptions for °C for hu --- spacy/lang/hu/tokenizer_exceptions.py | 5 +++++ spacy/lang/tokenizer_exceptions.py | 6 ++++++ spacy/tests/tokenizer/test_exceptions.py | 6 ++++++ spacy/tests/tokenizer/test_tokenizer.py | 19 ++++++++++++++++++- spacy/tokenizer.pyx | 2 +- 5 files changed, 36 insertions(+), 2 deletions(-) diff --git a/spacy/lang/hu/tokenizer_exceptions.py b/spacy/lang/hu/tokenizer_exceptions.py index 4a64a1d2c..ffaa74f50 100644 --- a/spacy/lang/hu/tokenizer_exceptions.py +++ b/spacy/lang/hu/tokenizer_exceptions.py @@ -646,5 +646,10 @@ _nums = r"(({ne})|({t})|({on})|({c}))({s})?".format( ) +for u in "cfkCFK": + _exc[f"°{u}"] = [{ORTH: f"°{u}"}] + _exc[f"°{u}."] = [{ORTH: f"°{u}"}, {ORTH: "."}] + + TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc) TOKEN_MATCH = re.compile(r"^{n}$".format(n=_nums)).match diff --git a/spacy/lang/tokenizer_exceptions.py b/spacy/lang/tokenizer_exceptions.py index e41db911f..d76fe4262 100644 --- a/spacy/lang/tokenizer_exceptions.py +++ b/spacy/lang/tokenizer_exceptions.py @@ -250,3 +250,9 @@ o.0 for orth in emoticons: BASE_EXCEPTIONS[orth] = [{ORTH: orth}] + + +# Moved from a suffix setting due to #9155 removing prefixes from consideration +# for lookbehinds +for u in "cfkCFK": + BASE_EXCEPTIONS[f"°{u}."] = [{ORTH: "°"}, {ORTH: f"{u}"}, {ORTH: "."}] diff --git a/spacy/tests/tokenizer/test_exceptions.py b/spacy/tests/tokenizer/test_exceptions.py index 9a98e049e..85716377a 100644 --- a/spacy/tests/tokenizer/test_exceptions.py +++ b/spacy/tests/tokenizer/test_exceptions.py @@ -45,3 +45,9 @@ def test_tokenizer_handles_emoji(tokenizer, text, length): if sys.maxunicode >= 1114111: tokens = tokenizer(text) assert len(tokens) == length + + +def test_tokenizer_degree(tokenizer): + for u in "cfkCFK": + assert [t.text for t in tokenizer(f"°{u}.")] == ["°", f"{u}", "."] + assert [t[1] for t in tokenizer.explain(f"°{u}.")] == ["°", f"{u}", "."] diff --git a/spacy/tests/tokenizer/test_tokenizer.py b/spacy/tests/tokenizer/test_tokenizer.py index 7d0c16745..192faa67b 100644 --- a/spacy/tests/tokenizer/test_tokenizer.py +++ b/spacy/tests/tokenizer/test_tokenizer.py @@ -2,7 +2,7 @@ import pytest import re from spacy.vocab import Vocab from spacy.tokenizer import Tokenizer -from spacy.util import ensure_path +from spacy.util import ensure_path, compile_prefix_regex, compile_suffix_regex from spacy.lang.en import English @@ -212,3 +212,20 @@ def 
test_tokenizer_flush_specials(en_vocab): assert [t.text for t in tokenizer1("a a.")] == ["a a", "."] tokenizer1.rules = {} assert [t.text for t in tokenizer1("a a.")] == ["a", "a", "."] + + +def test_tokenizer_prefix_suffix_overlap_lookbehind(en_vocab): + # the prefix and suffix matches overlap in the suffix lookbehind + prefixes = ['a(?=.)'] + suffixes = [r'(?<=\w)\.', r'(?<=a)\d+\.'] + prefix_re = compile_prefix_regex(prefixes) + suffix_re = compile_suffix_regex(suffixes) + tokenizer = Tokenizer( + en_vocab, + prefix_search=prefix_re.search, + suffix_search=suffix_re.search, + ) + tokens = [t.text for t in tokenizer("a10.")] + assert tokens == ["a", "10", "."] + explain_tokens = [t[1] for t in tokenizer.explain("a10.")] + assert tokens == explain_tokens diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index c0c8520c7..f8df13610 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -408,7 +408,7 @@ cdef class Tokenizer: string = minus_pre prefixes.push_back(self.vocab.get(mem, prefix)) break - suf_len = self.find_suffix(string) + suf_len = self.find_suffix(string[pre_len:]) if suf_len != 0: suffix = string[-suf_len:] minus_suf = string[:-suf_len] From 0c97ed2746d13f06f98e5e84ba52f68a35763d23 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 27 Oct 2021 13:13:03 +0200 Subject: [PATCH 034/133] Rename ja morph features to Inflection and Reading (#9520) * Rename ja morph features to Inflection and Reading --- spacy/lang/ja/__init__.py | 4 ++-- spacy/tests/lang/ja/test_tokenizer.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py index 33335a189..127c4c8ac 100644 --- a/spacy/lang/ja/__init__.py +++ b/spacy/lang/ja/__init__.py @@ -80,12 +80,12 @@ class JapaneseTokenizer(DummyTokenizer): morph = {} if dtoken.inf: # it's normal for this to be empty for non-inflecting types - morph["inflection"] = dtoken.inf + morph["Inflection"] = dtoken.inf token.norm_ = dtoken.norm if dtoken.reading: # punctuation is its own reading, but we don't want values like # "=" here - morph["reading"] = re.sub("[=|]", "_", dtoken.reading) + morph["Reading"] = re.sub("[=|]", "_", dtoken.reading) token.morph = MorphAnalysis(self.vocab, morph) if self.need_subtokens: doc.user_data["sub_tokens"] = sub_tokens_list diff --git a/spacy/tests/lang/ja/test_tokenizer.py b/spacy/tests/lang/ja/test_tokenizer.py index eb170061a..098884cf0 100644 --- a/spacy/tests/lang/ja/test_tokenizer.py +++ b/spacy/tests/lang/ja/test_tokenizer.py @@ -144,9 +144,9 @@ def test_ja_tokenizer_inflections_reading_forms( ja_tokenizer, text, inflections, reading_forms ): tokens = ja_tokenizer(text) - test_inflections = [tt.morph.get("inflection") for tt in tokens] + test_inflections = [tt.morph.get("Inflection") for tt in tokens] assert test_inflections == list(inflections) - test_readings = [tt.morph.get("reading") for tt in tokens] + test_readings = [tt.morph.get("Reading") for tt in tokens] assert test_readings == list(reading_forms) From c053f158c582137abc25c9dc3b5e30b4a3708916 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 27 Oct 2021 14:08:31 +0200 Subject: [PATCH 035/133] Add support for floret vectors (#8909) * Add support for fasttext-bloom hash-only vectors Overview: * Extend `Vectors` to have two modes: `default` and `ngram` * `default` is the default mode and equivalent to the current `Vectors` * `ngram` supports the hash-only ngram tables from `fasttext-bloom` * Extend `spacy.StaticVectors.v2` to handle both modes with no changes for 
`default` vectors * Extend `spacy init vectors` to support ngram tables The `ngram` mode **only** supports vector tables produced by this fork of fastText, which adds an option to represent all vectors using only the ngram buckets table and which uses the exact same ngram generation algorithm and hash function (`MurmurHash3_x64_128`). `fasttext-bloom` produces an additional `.hashvec` table, which can be loaded by `spacy init vectors --fasttext-bloom-vectors`. https://github.com/adrianeboyd/fastText/tree/feature/bloom Implementation details: * `Vectors` now includes the `StringStore` as `Vectors.strings` so that the API can stay consistent for both `default` (which can look up from `str` or `int`) and `ngram` (which requires `str` to calculate the ngrams). * In ngram mode `Vectors` uses a default `Vectors` object as a cache since the ngram vectors lookups are relatively expensive. * The default cache size is the same size as the provided ngram vector table. * Once the cache is full, no more entries are added. The user is responsible for managing the cache in cases where the initial documents are not representative of the texts. * The cache can be resized by setting `Vectors.ngram_cache_size` or cleared with `vectors._ngram_cache.clear()`. * The API ends up a bit split between methods for `default` and for `ngram`, so functions that only make sense for `default` or `ngram` include warnings with custom messages suggesting alternatives where possible. * `Vocab.vectors` becomes a property so that the string stores can be synced when assigning vectors to a vocab. * `Vectors` serializes its own config settings as `vectors.cfg`. * The `Vectors` serialization methods have added support for `exclude` so that the `Vocab` can exclude the `Vectors` strings while serializing. Removed: * The `minn` and `maxn` options and related code from `Vocab.get_vector`, which does not work in a meaningful way for default vector tables. * The unused `GlobalRegistry` in `Vectors`. * Refactor to use reduce_mean Refactor to use reduce_mean and remove the ngram vectors cache. * Rename to floret * Rename to floret in error messages * Use --vectors-mode in CLI, vector init * Fix vectors mode in init * Remove unused var * Minor API and docstrings adjustments * Rename `--vectors-mode` to `--mode` in `init vectors` CLI * Rename `Vectors.get_floret_vectors` to `Vectors.get_batch` and support both modes. * Minor updates to Vectors docstrings. 
* Update API docs for Vectors and init vectors CLI * Update types for StaticVectors --- spacy/cli/init_pipeline.py | 10 +- spacy/errors.py | 17 +- spacy/language.py | 1 + spacy/ml/staticvectors.py | 36 ++- .../serialize/test_serialize_vocab_strings.py | 6 + spacy/tests/vocab_vectors/test_vectors.py | 220 ++++++++++--- spacy/tokens/doc.pyi | 4 +- spacy/tokens/doc.pyx | 8 +- spacy/training/initialize.py | 71 ++++- spacy/vectors.pyx | 294 ++++++++++++++---- spacy/vocab.pxd | 2 +- spacy/vocab.pyx | 74 ++--- website/docs/api/cli.md | 1 + website/docs/api/vectors.md | 102 ++++-- 14 files changed, 644 insertions(+), 202 deletions(-) diff --git a/spacy/cli/init_pipeline.py b/spacy/cli/init_pipeline.py index 2a920cdda..d53a61b8e 100644 --- a/spacy/cli/init_pipeline.py +++ b/spacy/cli/init_pipeline.py @@ -20,6 +20,7 @@ def init_vectors_cli( output_dir: Path = Arg(..., help="Pipeline output directory"), prune: int = Opt(-1, "--prune", "-p", help="Optional number of vectors to prune to"), truncate: int = Opt(0, "--truncate", "-t", help="Optional number of vectors to truncate to when reading in vectors file"), + mode: str = Opt("default", "--mode", "-m", help="Vectors mode: default or floret"), name: Optional[str] = Opt(None, "--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"), verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"), jsonl_loc: Optional[Path] = Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file", hidden=True), @@ -34,7 +35,14 @@ def init_vectors_cli( nlp = util.get_lang_class(lang)() if jsonl_loc is not None: update_lexemes(nlp, jsonl_loc) - convert_vectors(nlp, vectors_loc, truncate=truncate, prune=prune, name=name) + convert_vectors( + nlp, + vectors_loc, + truncate=truncate, + prune=prune, + name=name, + mode=mode, + ) msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors") nlp.to_disk(output_dir) msg.good( diff --git a/spacy/errors.py b/spacy/errors.py index 4fe3e9003..5cdd2bedd 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -27,6 +27,9 @@ def setup_default_warnings(): # warn once about lemmatizer without required POS filter_warning("once", error_msg=Warnings.W108) + # floret vector table cannot be modified + filter_warning("once", error_msg="[W114]") + def filter_warning(action: str, error_msg: str): """Customize how spaCy should handle a certain warning. @@ -192,6 +195,8 @@ class Warnings: "vectors are not identical to current pipeline vectors.") W114 = ("Using multiprocessing with GPU models is not recommended and may " "lead to errors.") + W115 = ("Skipping {method}: the floret vector table cannot be modified. " + "Vectors are calculated from character ngrams.") @add_codes @@ -518,9 +523,19 @@ class Errors: E199 = ("Unable to merge 0-length span at `doc[{start}:{end}]`.") E200 = ("Can't yet set {attr} from Span. Vote for this feature on the " "issue tracker: http://github.com/explosion/spaCy/issues") - E202 = ("Unsupported alignment mode '{mode}'. Supported modes: {modes}.") + E202 = ("Unsupported {name} mode '{mode}'. Supported modes: {modes}.") # New errors added in v3.x + E858 = ("The {mode} vector table does not support this operation. 
" + "{alternative}") + E859 = ("The floret vector table cannot be modified.") + E860 = ("Can't truncate fasttext-bloom vectors.") + E861 = ("No 'keys' should be provided when initializing floret vectors " + "with 'minn' and 'maxn'.") + E862 = ("'hash_count' must be between 1-4 for floret vectors.") + E863 = ("'maxn' must be greater than or equal to 'minn'.") + E864 = ("The complete vector table 'data' is required to initialize floret " + "vectors.") E865 = ("A SpanGroup is not functional after the corresponding Doc has " "been garbage collected. To keep using the spans, make sure that " "the corresponding Doc object is still available in the scope of " diff --git a/spacy/language.py b/spacy/language.py index 83de83702..64d5d5a8a 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -228,6 +228,7 @@ class Language: "vectors": len(self.vocab.vectors), "keys": self.vocab.vectors.n_keys, "name": self.vocab.vectors.name, + "mode": self.vocab.vectors.mode, } self._meta["labels"] = dict(self.pipe_labels) # TODO: Adding this back to prevent breaking people's code etc., but diff --git a/spacy/ml/staticvectors.py b/spacy/ml/staticvectors.py index 53ef01906..741007bdb 100644 --- a/spacy/ml/staticvectors.py +++ b/spacy/ml/staticvectors.py @@ -1,11 +1,13 @@ -from typing import List, Tuple, Callable, Optional, cast +from typing import List, Tuple, Callable, Optional, Sequence, cast from thinc.initializers import glorot_uniform_init from thinc.util import partial -from thinc.types import Ragged, Floats2d, Floats1d +from thinc.types import Ragged, Floats2d, Floats1d, Ints1d from thinc.api import Model, Ops, registry from ..tokens import Doc from ..errors import Errors +from ..vectors import Mode +from ..vocab import Vocab @registry.layers("spacy.StaticVectors.v2") @@ -34,20 +36,32 @@ def StaticVectors( def forward( model: Model[List[Doc], Ragged], docs: List[Doc], is_train: bool ) -> Tuple[Ragged, Callable]: - if not sum(len(doc) for doc in docs): + token_count = sum(len(doc) for doc in docs) + if not token_count: return _handle_empty(model.ops, model.get_dim("nO")) - key_attr = model.attrs["key_attr"] - W = cast(Floats2d, model.ops.as_contig(model.get_param("W"))) - V = cast(Floats2d, model.ops.asarray(docs[0].vocab.vectors.data)) - rows = model.ops.flatten( - [doc.vocab.vectors.find(keys=doc.to_array(key_attr)) for doc in docs] + key_attr: int = model.attrs["key_attr"] + keys: Ints1d = model.ops.flatten( + cast(Sequence, [doc.to_array(key_attr) for doc in docs]) ) + vocab: Vocab = docs[0].vocab + W = cast(Floats2d, model.ops.as_contig(model.get_param("W"))) + if vocab.vectors.mode == Mode.default: + V = cast(Floats2d, model.ops.asarray(vocab.vectors.data)) + rows = vocab.vectors.find(keys=keys) + V = model.ops.as_contig(V[rows]) + elif vocab.vectors.mode == Mode.floret: + V = cast(Floats2d, vocab.vectors.get_batch(keys)) + V = model.ops.as_contig(V) + else: + raise RuntimeError(Errors.E896) try: - vectors_data = model.ops.gemm(model.ops.as_contig(V[rows]), W, trans2=True) + vectors_data = model.ops.gemm(V, W, trans2=True) except ValueError: raise RuntimeError(Errors.E896) - # Convert negative indices to 0-vectors (TODO: more options for UNK tokens) - vectors_data[rows < 0] = 0 + if vocab.vectors.mode == Mode.default: + # Convert negative indices to 0-vectors + # TODO: more options for UNK tokens + vectors_data[rows < 0] = 0 output = Ragged( vectors_data, model.ops.asarray([len(doc) for doc in docs], dtype="i") # type: ignore ) diff --git a/spacy/tests/serialize/test_serialize_vocab_strings.py 
b/spacy/tests/serialize/test_serialize_vocab_strings.py index 3fe9363bf..ab403ab54 100644 --- a/spacy/tests/serialize/test_serialize_vocab_strings.py +++ b/spacy/tests/serialize/test_serialize_vocab_strings.py @@ -1,7 +1,9 @@ import pytest import pickle +from thinc.api import get_current_ops from spacy.vocab import Vocab from spacy.strings import StringStore +from spacy.vectors import Vectors from ..util import make_tempdir @@ -129,7 +131,11 @@ def test_serialize_stringstore_roundtrip_disk(strings1, strings2): @pytest.mark.parametrize("strings,lex_attr", test_strings_attrs) def test_pickle_vocab(strings, lex_attr): vocab = Vocab(strings=strings) + ops = get_current_ops() + vectors = Vectors(data=ops.xp.zeros((10, 10)), mode="floret", hash_count=1) + vocab.vectors = vectors vocab[strings[0]].norm_ = lex_attr vocab_pickled = pickle.dumps(vocab) vocab_unpickled = pickle.loads(vocab_pickled) assert vocab.to_bytes() == vocab_unpickled.to_bytes() + assert vocab_unpickled.vectors.mode == "floret" diff --git a/spacy/tests/vocab_vectors/test_vectors.py b/spacy/tests/vocab_vectors/test_vectors.py index 23597455f..f2e74c3c9 100644 --- a/spacy/tests/vocab_vectors/test_vectors.py +++ b/spacy/tests/vocab_vectors/test_vectors.py @@ -1,12 +1,14 @@ import pytest import numpy -from numpy.testing import assert_allclose, assert_equal +from numpy.testing import assert_allclose, assert_equal, assert_almost_equal from thinc.api import get_current_ops +from spacy.lang.en import English from spacy.vocab import Vocab from spacy.vectors import Vectors from spacy.tokenizer import Tokenizer from spacy.strings import hash_string # type: ignore from spacy.tokens import Doc +from spacy.training.initialize import convert_vectors from ..util import add_vecs_to_vocab, get_cosine, make_tempdir @@ -29,22 +31,6 @@ def vectors(): ] -@pytest.fixture -def ngrams_vectors(): - return [ - ("apple", OPS.asarray([1, 2, 3])), - ("app", OPS.asarray([-0.1, -0.2, -0.3])), - ("ppl", OPS.asarray([-0.2, -0.3, -0.4])), - ("pl", OPS.asarray([0.7, 0.8, 0.9])), - ] - - -@pytest.fixture() -def ngrams_vocab(en_vocab, ngrams_vectors): - add_vecs_to_vocab(en_vocab, ngrams_vectors) - return en_vocab - - @pytest.fixture def data(): return numpy.asarray([[0.0, 1.0, 2.0], [3.0, -2.0, 4.0]], dtype="f") @@ -125,6 +111,7 @@ def test_init_vectors_with_data(strings, data): def test_init_vectors_with_shape(strings): v = Vectors(shape=(len(strings), 3)) assert v.shape == (len(strings), 3) + assert v.is_full is False def test_get_vector(strings, data): @@ -180,30 +167,6 @@ def test_vectors_token_vector(tokenizer_v, vectors, text): assert all([a == b for a, b in zip(vectors[1][1], doc[2].vector)]) -@pytest.mark.parametrize("text", ["apple"]) -def test_vectors__ngrams_word(ngrams_vocab, ngrams_vectors, text): - assert list(ngrams_vocab.get_vector(text)) == list(ngrams_vectors[0][1]) - - -@pytest.mark.parametrize("text", ["applpie"]) -def test_vectors__ngrams_subword(ngrams_vocab, ngrams_vectors, text): - truth = list(ngrams_vocab.get_vector(text, 1, 6)) - test = list( - [ - ( - ngrams_vectors[1][1][i] - + ngrams_vectors[2][1][i] - + ngrams_vectors[3][1][i] - ) - / 3 - for i in range(len(ngrams_vectors[1][1])) - ] - ) - eps = [abs(truth[i] - test[i]) for i in range(len(truth))] - for i in eps: - assert i < 1e-6 - - @pytest.mark.parametrize("text", ["apple", "orange"]) def test_vectors_lexeme_vector(vocab, text): lex = vocab[text] @@ -379,3 +342,178 @@ def test_vector_is_oov(): assert vocab["cat"].is_oov is False assert vocab["dog"].is_oov is False assert 
vocab["hamster"].is_oov is True + + +def test_init_vectors_unset(): + v = Vectors(shape=(10, 10)) + assert v.is_full is False + assert v.data.shape == (10, 10) + + with pytest.raises(ValueError): + v = Vectors(shape=(10, 10), mode="floret") + + v = Vectors(data=OPS.xp.zeros((10, 10)), mode="floret", hash_count=1) + assert v.is_full is True + + +def test_vectors_clear(): + data = OPS.asarray([[4, 2, 2, 2], [4, 2, 2, 2], [1, 1, 1, 1]], dtype="f") + v = Vectors(data=data, keys=["A", "B", "C"]) + assert v.is_full is True + assert hash_string("A") in v + v.clear() + # no keys + assert v.key2row == {} + assert list(v) == [] + assert v.is_full is False + assert "A" not in v + with pytest.raises(KeyError): + v["A"] + + +def test_vectors_get_batch(): + data = OPS.asarray([[4, 2, 2, 2], [4, 2, 2, 2], [1, 1, 1, 1]], dtype="f") + v = Vectors(data=data, keys=["A", "B", "C"]) + # check with mixed int/str keys + words = ["C", "B", "A", v.strings["B"]] + rows = v.find(keys=words) + vecs = OPS.as_contig(v.data[rows]) + assert_equal(OPS.to_numpy(vecs), OPS.to_numpy(v.get_batch(words))) + + +@pytest.fixture() +def floret_vectors_hashvec_str(): + """The full hashvec table from floret with the settings: + bucket 10, dim 10, minn 2, maxn 3, hash count 2, hash seed 2166136261, + bow <, eow >""" + return """10 10 2 3 2 2166136261 < > +0 -2.2611 3.9302 2.6676 -11.233 0.093715 -10.52 -9.6463 -0.11853 2.101 -0.10145 +1 -3.12 -1.7981 10.7 -6.171 4.4527 10.967 9.073 6.2056 -6.1199 -2.0402 +2 9.5689 5.6721 -8.4832 -1.2249 2.1871 -3.0264 -2.391 -5.3308 -3.2847 -4.0382 +3 3.6268 4.2759 -1.7007 1.5002 5.5266 1.8716 -12.063 0.26314 2.7645 2.4929 +4 -11.683 -7.7068 2.1102 2.214 7.2202 0.69799 3.2173 -5.382 -2.0838 5.0314 +5 -4.3024 8.0241 2.0714 -1.0174 -0.28369 1.7622 7.8797 -1.7795 6.7541 5.6703 +6 8.3574 -5.225 8.6529 8.5605 -8.9465 3.767 -5.4636 -1.4635 -0.98947 -0.58025 +7 -10.01 3.3894 -4.4487 1.1669 -11.904 6.5158 4.3681 0.79913 -6.9131 -8.687 +8 -5.4576 7.1019 -8.8259 1.7189 4.955 -8.9157 -3.8905 -0.60086 -2.1233 5.892 +9 8.0678 -4.4142 3.6236 4.5889 -2.7611 2.4455 0.67096 -4.2822 2.0875 4.6274 +""" + + +@pytest.fixture() +def floret_vectors_vec_str(): + """The top 10 rows from floret with the settings above, to verify + that the spacy floret vectors are equivalent to the fasttext static + vectors.""" + return """10 10 +, -5.7814 2.6918 0.57029 -3.6985 -2.7079 1.4406 1.0084 1.7463 -3.8625 -3.0565 +. 
3.8016 -1.759 0.59118 3.3044 -0.72975 0.45221 -2.1412 -3.8933 -2.1238 -0.47409 +der 0.08224 2.6601 -1.173 1.1549 -0.42821 -0.097268 -2.5589 -1.609 -0.16968 0.84687 +die -2.8781 0.082576 1.9286 -0.33279 0.79488 3.36 3.5609 -0.64328 -2.4152 0.17266 +und 2.1558 1.8606 -1.382 0.45424 -0.65889 1.2706 0.5929 -2.0592 -2.6949 -1.6015 +" -1.1242 1.4588 -1.6263 1.0382 -2.7609 -0.99794 -0.83478 -1.5711 -1.2137 1.0239 +in -0.87635 2.0958 4.0018 -2.2473 -1.2429 2.3474 1.8846 0.46521 -0.506 -0.26653 +von -0.10589 1.196 1.1143 -0.40907 -1.0848 -0.054756 -2.5016 -1.0381 -0.41598 0.36982 +( 0.59263 2.1856 0.67346 1.0769 1.0701 1.2151 1.718 -3.0441 2.7291 3.719 +) 0.13812 3.3267 1.657 0.34729 -3.5459 0.72372 0.63034 -1.6145 1.2733 0.37798 +""" + + +def test_floret_vectors(floret_vectors_vec_str, floret_vectors_hashvec_str): + nlp = English() + nlp_plain = English() + # load both vec and hashvec tables + with make_tempdir() as tmpdir: + p = tmpdir / "test.hashvec" + with open(p, "w") as fileh: + fileh.write(floret_vectors_hashvec_str) + convert_vectors(nlp, p, truncate=0, prune=-1, mode="floret") + p = tmpdir / "test.vec" + with open(p, "w") as fileh: + fileh.write(floret_vectors_vec_str) + convert_vectors(nlp_plain, p, truncate=0, prune=-1) + + word = "der" + # ngrams: full padded word + padded 2-grams + padded 3-grams + ngrams = nlp.vocab.vectors._get_ngrams(word) + assert ngrams == ["", "", ""] + # rows: 2 rows per ngram + rows = OPS.xp.asarray( + [ + h % nlp.vocab.vectors.data.shape[0] + for ngram in ngrams + for h in nlp.vocab.vectors._get_ngram_hashes(ngram) + ], + dtype="uint32", + ) + assert_equal( + OPS.to_numpy(rows), + numpy.asarray([5, 6, 7, 5, 8, 2, 8, 9, 3, 3, 4, 6, 7, 3, 0, 2]), + ) + assert len(rows) == len(ngrams) * nlp.vocab.vectors.hash_count + # all vectors are equivalent for plain static table vs. 
hash ngrams + for word in nlp_plain.vocab.vectors: + word = nlp_plain.vocab.strings.as_string(word) + assert_almost_equal( + nlp.vocab[word].vector, nlp_plain.vocab[word].vector, decimal=3 + ) + + # every word has a vector + assert nlp.vocab[word * 5].has_vector + + # check that single and batched vector lookups are identical + words = [s for s in nlp_plain.vocab.vectors] + single_vecs = OPS.to_numpy(OPS.asarray([nlp.vocab[word].vector for word in words])) + batch_vecs = OPS.to_numpy(nlp.vocab.vectors.get_batch(words)) + assert_equal(single_vecs, batch_vecs) + + # an empty key returns 0s + assert_equal( + OPS.to_numpy(nlp.vocab[""].vector), + numpy.zeros((nlp.vocab.vectors.data.shape[0],)), + ) + # an empty batch returns 0s + assert_equal( + OPS.to_numpy(nlp.vocab.vectors.get_batch([""])), + numpy.zeros((1, nlp.vocab.vectors.data.shape[0])), + ) + # an empty key within a batch returns 0s + assert_equal( + OPS.to_numpy(nlp.vocab.vectors.get_batch(["a", "", "b"])[1]), + numpy.zeros((nlp.vocab.vectors.data.shape[0],)), + ) + + # the loaded ngram vector table cannot be modified + # except for clear: warning, then return without modifications + vector = list(range(nlp.vocab.vectors.shape[1])) + orig_bytes = nlp.vocab.vectors.to_bytes(exclude=["strings"]) + with pytest.warns(UserWarning): + nlp.vocab.set_vector("the", vector) + assert orig_bytes == nlp.vocab.vectors.to_bytes(exclude=["strings"]) + with pytest.warns(UserWarning): + nlp.vocab[word].vector = vector + assert orig_bytes == nlp.vocab.vectors.to_bytes(exclude=["strings"]) + with pytest.warns(UserWarning): + nlp.vocab.vectors.add("the", row=6) + assert orig_bytes == nlp.vocab.vectors.to_bytes(exclude=["strings"]) + with pytest.warns(UserWarning): + nlp.vocab.vectors.resize(shape=(100, 10)) + assert orig_bytes == nlp.vocab.vectors.to_bytes(exclude=["strings"]) + with pytest.raises(ValueError): + nlp.vocab.vectors.clear() + + # data and settings are serialized correctly + with make_tempdir() as d: + nlp.vocab.to_disk(d) + vocab_r = Vocab() + vocab_r.from_disk(d) + assert nlp.vocab.vectors.to_bytes() == vocab_r.vectors.to_bytes() + assert_equal( + OPS.to_numpy(nlp.vocab.vectors.data), OPS.to_numpy(vocab_r.vectors.data) + ) + assert_equal(nlp.vocab.vectors._get_cfg(), vocab_r.vectors._get_cfg()) + assert_almost_equal( + OPS.to_numpy(nlp.vocab[word].vector), + OPS.to_numpy(vocab_r[word].vector), + decimal=6, + ) diff --git a/spacy/tokens/doc.pyi b/spacy/tokens/doc.pyi index 2b18cee7a..0fa1d0d4f 100644 --- a/spacy/tokens/doc.pyi +++ b/spacy/tokens/doc.pyi @@ -138,8 +138,8 @@ class Doc: def count_by( self, attr_id: int, exclude: Optional[Any] = ..., counts: Optional[Any] = ... ) -> Dict[Any, int]: ... - def from_array(self, attrs: List[int], array: Ints2d) -> Doc: ... - def to_array(self, py_attr_ids: List[int]) -> numpy.ndarray: ... + def from_array(self, attrs: Union[int, str, List[Union[int, str]]], array: Ints2d) -> Doc: ... + def to_array(self, py_attr_ids: Union[int, str, List[Union[int, str]]]) -> numpy.ndarray: ... 
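# A small illustration of the widened stubs above, for some existing `doc` and
# a matching `arr` (hypothetical variables): attribute ids may be given as a
# single name or ID as well as a list, e.g.
#     arr = doc.to_array(["ORTH", "POS"])
#     doc2 = doc.from_array(["ORTH", "POS"], arr)
#     orths = doc.to_array("ORTH")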
@staticmethod def from_docs( docs: List[Doc], diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index d65d18f48..d74bd7f6e 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -534,7 +534,13 @@ cdef class Doc: kb_id = self.vocab.strings.add(kb_id) alignment_modes = ("strict", "contract", "expand") if alignment_mode not in alignment_modes: - raise ValueError(Errors.E202.format(mode=alignment_mode, modes=", ".join(alignment_modes))) + raise ValueError( + Errors.E202.format( + name="alignment", + mode=alignment_mode, + modes=", ".join(alignment_modes), + ) + ) cdef int start = token_by_char(self.c, self.length, start_idx) if start < 0 or (alignment_mode == "strict" and start_idx != self[start].idx): return None diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py index 96abcc7cd..13ccfeb93 100644 --- a/spacy/training/initialize.py +++ b/spacy/training/initialize.py @@ -13,7 +13,7 @@ import warnings from .pretrain import get_tok2vec_ref from ..lookups import Lookups -from ..vectors import Vectors +from ..vectors import Vectors, Mode as VectorsMode from ..errors import Errors, Warnings from ..schemas import ConfigSchemaTraining from ..util import registry, load_model_from_config, resolve_dot_names, logger @@ -160,7 +160,13 @@ def load_vectors_into_model( err = ConfigValidationError.from_error(e, title=title, desc=desc) raise err from None - if len(vectors_nlp.vocab.vectors.keys()) == 0: + if ( + len(vectors_nlp.vocab.vectors.keys()) == 0 + and vectors_nlp.vocab.vectors.mode != VectorsMode.floret + ) or ( + vectors_nlp.vocab.vectors.data.shape[0] == 0 + and vectors_nlp.vocab.vectors.mode == VectorsMode.floret + ): logger.warning(Warnings.W112.format(name=name)) for lex in nlp.vocab: @@ -197,41 +203,80 @@ def convert_vectors( truncate: int, prune: int, name: Optional[str] = None, + mode: str = VectorsMode.default, ) -> None: vectors_loc = ensure_path(vectors_loc) if vectors_loc and vectors_loc.parts[-1].endswith(".npz"): - nlp.vocab.vectors = Vectors(data=numpy.load(vectors_loc.open("rb"))) + nlp.vocab.vectors = Vectors( + strings=nlp.vocab.strings, data=numpy.load(vectors_loc.open("rb")) + ) for lex in nlp.vocab: if lex.rank and lex.rank != OOV_RANK: nlp.vocab.vectors.add(lex.orth, row=lex.rank) # type: ignore[attr-defined] else: if vectors_loc: logger.info(f"Reading vectors from {vectors_loc}") - vectors_data, vector_keys = read_vectors(vectors_loc, truncate) + vectors_data, vector_keys, floret_settings = read_vectors( + vectors_loc, + truncate, + mode=mode, + ) logger.info(f"Loaded vectors from {vectors_loc}") else: vectors_data, vector_keys = (None, None) - if vector_keys is not None: + if vector_keys is not None and mode != VectorsMode.floret: for word in vector_keys: if word not in nlp.vocab: nlp.vocab[word] if vectors_data is not None: - nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys) + if mode == VectorsMode.floret: + nlp.vocab.vectors = Vectors( + strings=nlp.vocab.strings, + data=vectors_data, + **floret_settings, + ) + else: + nlp.vocab.vectors = Vectors( + strings=nlp.vocab.strings, data=vectors_data, keys=vector_keys + ) if name is None: # TODO: Is this correct? Does this matter? 
nlp.vocab.vectors.name = f"{nlp.meta['lang']}_{nlp.meta['name']}.vectors" else: nlp.vocab.vectors.name = name nlp.meta["vectors"]["name"] = nlp.vocab.vectors.name - if prune >= 1: + if prune >= 1 and mode != VectorsMode.floret: nlp.vocab.prune_vectors(prune) -def read_vectors(vectors_loc: Path, truncate_vectors: int): +def read_vectors( + vectors_loc: Path, truncate_vectors: int, *, mode: str = VectorsMode.default +): f = ensure_shape(vectors_loc) - shape = tuple(int(size) for size in next(f).split()) - if truncate_vectors >= 1: - shape = (truncate_vectors, shape[1]) + header_parts = next(f).split() + shape = tuple(int(size) for size in header_parts[:2]) + floret_settings = {} + if mode == VectorsMode.floret: + if len(header_parts) != 8: + raise ValueError( + "Invalid header for floret vectors. " + "Expected: bucket dim minn maxn hash_count hash_seed BOW EOW" + ) + floret_settings = { + "mode": "floret", + "minn": int(header_parts[2]), + "maxn": int(header_parts[3]), + "hash_count": int(header_parts[4]), + "hash_seed": int(header_parts[5]), + "bow": header_parts[6], + "eow": header_parts[7], + } + if truncate_vectors >= 1: + raise ValueError(Errors.E860) + else: + assert len(header_parts) == 2 + if truncate_vectors >= 1: + shape = (truncate_vectors, shape[1]) vectors_data = numpy.zeros(shape=shape, dtype="f") vectors_keys = [] for i, line in enumerate(tqdm.tqdm(f)): @@ -244,7 +289,7 @@ def read_vectors(vectors_loc: Path, truncate_vectors: int): vectors_keys.append(word) if i == truncate_vectors - 1: break - return vectors_data, vectors_keys + return vectors_data, vectors_keys, floret_settings def open_file(loc: Union[str, Path]) -> IO: @@ -271,7 +316,7 @@ def ensure_shape(vectors_loc): lines = open_file(vectors_loc) first_line = next(lines) try: - shape = tuple(int(size) for size in first_line.split()) + shape = tuple(int(size) for size in first_line.split()[:2]) except ValueError: shape = None if shape is not None: diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx index 7cb3322c2..6d6783af4 100644 --- a/spacy/vectors.pyx +++ b/spacy/vectors.pyx @@ -1,16 +1,23 @@ cimport numpy as np +from libc.stdint cimport uint32_t from cython.operator cimport dereference as deref from libcpp.set cimport set as cppset +from murmurhash.mrmr cimport hash128_x64 import functools import numpy +from typing import cast +import warnings +from enum import Enum import srsly from thinc.api import get_array_module, get_current_ops +from thinc.backends import get_array_ops +from thinc.types import Floats2d from .strings cimport StringStore from .strings import get_string_id -from .errors import Errors +from .errors import Errors, Warnings from . import util @@ -18,18 +25,13 @@ def unpickle_vectors(bytes_data): return Vectors().from_bytes(bytes_data) -class GlobalRegistry: - """Global store of vectors, to avoid repeatedly loading the data.""" - data = {} +class Mode(str, Enum): + default = "default" + floret = "floret" @classmethod - def register(cls, name, data): - cls.data[name] = data - return functools.partial(cls.get, name) - - @classmethod - def get(cls, name): - return cls.data[name] + def values(cls): + return list(cls.__members__.keys()) cdef class Vectors: @@ -37,45 +39,93 @@ cdef class Vectors: Vectors data is kept in the vectors.data attribute, which should be an instance of numpy.ndarray (for CPU vectors) or cupy.ndarray - (for GPU vectors). `vectors.key2row` is a dictionary mapping word hashes to - rows in the vectors.data table. + (for GPU vectors). 
- Multiple keys can be mapped to the same vector, and not all of the rows in - the table need to be assigned - so len(list(vectors.keys())) may be - greater or smaller than vectors.shape[0]. + In the default mode, `vectors.key2row` is a dictionary mapping word hashes + to rows in the vectors.data table. Multiple keys can be mapped to the same + vector, and not all of the rows in the table need to be assigned - so + len(list(vectors.keys())) may be greater or smaller than vectors.shape[0]. + + In floret mode, the floret settings (minn, maxn, etc.) are used to + calculate the vector from the rows corresponding to the key's ngrams. DOCS: https://spacy.io/api/vectors """ + cdef public object strings cdef public object name + cdef readonly object mode cdef public object data cdef public object key2row cdef cppset[int] _unset + cdef readonly uint32_t minn + cdef readonly uint32_t maxn + cdef readonly uint32_t hash_count + cdef readonly uint32_t hash_seed + cdef readonly unicode bow + cdef readonly unicode eow - def __init__(self, *, shape=None, data=None, keys=None, name=None): + def __init__(self, *, strings=None, shape=None, data=None, keys=None, name=None, mode=Mode.default, minn=0, maxn=0, hash_count=1, hash_seed=0, bow="<", eow=">"): """Create a new vector store. + strings (StringStore): The string store. shape (tuple): Size of the table, as (# entries, # columns) data (numpy.ndarray or cupy.ndarray): The vector data. keys (iterable): A sequence of keys, aligned with the data. name (str): A name to identify the vectors table. + mode (str): Vectors mode: "default" or "floret" (default: "default"). + minn (int): The floret char ngram minn (default: 0). + maxn (int): The floret char ngram maxn (default: 0). + hash_count (int): The floret hash count (1-4, default: 1). + hash_seed (int): The floret hash seed (default: 0). + bow (str): The floret BOW string (default: "<"). + eow (str): The floret EOW string (default: ">"). 
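As a quick illustration of the parameters listed above, here is a hedged sketch of constructing each kind of table. The floret values are made up for the example; in practice they have to match the settings the floret table was trained with:

```python
import numpy
from spacy.strings import StringStore
from spacy.vectors import Vectors

# Default mode: vector rows plus aligned keys, as in earlier spaCy versions.
data = numpy.zeros((3, 300), dtype="f")
default_vectors = Vectors(strings=StringStore(), data=data, keys=["cat", "dog", "rat"])

# floret mode: the full table and the ngram/hash settings are required up
# front, and `keys` is not allowed because rows are derived from ngram hashes.
floret_table = numpy.zeros((50000, 300), dtype="f")
floret_vectors = Vectors(
    strings=StringStore(),
    data=floret_table,
    mode="floret",
    minn=4,
    maxn=5,
    hash_count=2,
)
```

Passing an invalid combination (for example `keys` together with `mode="floret"`, or `maxn` smaller than `minn`) raises one of the new `E861`–`E864` errors.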
DOCS: https://spacy.io/api/vectors#init """ + self.strings = strings + if self.strings is None: + self.strings = StringStore() self.name = name - if data is None: - if shape is None: - shape = (0,0) - ops = get_current_ops() - data = ops.xp.zeros(shape, dtype="f") - self.data = data + if mode not in Mode.values(): + raise ValueError( + Errors.E202.format( + name="vectors", + mode=mode, + modes=str(Mode.values()) + ) + ) + self.mode = Mode(mode).value self.key2row = {} - if self.data is not None: - self._unset = cppset[int]({i for i in range(self.data.shape[0])}) - else: + self.minn = minn + self.maxn = maxn + self.hash_count = hash_count + self.hash_seed = hash_seed + self.bow = bow + self.eow = eow + if self.mode == Mode.default: + if data is None: + if shape is None: + shape = (0,0) + ops = get_current_ops() + data = ops.xp.zeros(shape, dtype="f") + self._unset = cppset[int]({i for i in range(data.shape[0])}) + else: + self._unset = cppset[int]() + self.data = data + if keys is not None: + for i, key in enumerate(keys): + self.add(key, row=i) + elif self.mode == Mode.floret: + if maxn < minn: + raise ValueError(Errors.E863) + if hash_count < 1 or hash_count >= 5: + raise ValueError(Errors.E862) + if data is None: + raise ValueError(Errors.E864) + if keys is not None: + raise ValueError(Errors.E861) + self.data = data self._unset = cppset[int]() - if keys is not None: - for i, key in enumerate(keys): - self.add(key, row=i) @property def shape(self): @@ -106,6 +156,8 @@ cdef class Vectors: DOCS: https://spacy.io/api/vectors#is_full """ + if self.mode == Mode.floret: + return True return self._unset.size() == 0 @property @@ -113,7 +165,8 @@ cdef class Vectors: """Get the number of keys in the table. Note that this is the number of all keys, not just unique vectors. - RETURNS (int): The number of keys in the table. + RETURNS (int): The number of keys in the table for default vectors. + For floret vectors, return -1. DOCS: https://spacy.io/api/vectors#n_keys """ @@ -125,25 +178,33 @@ cdef class Vectors: def __getitem__(self, key): """Get a vector by key. If the key is not found, a KeyError is raised. - key (int): The key to get the vector for. + key (str/int): The key to get the vector for. RETURNS (ndarray): The vector for the key. DOCS: https://spacy.io/api/vectors#getitem """ - i = self.key2row[key] - if i is None: - raise KeyError(Errors.E058.format(key=key)) - else: - return self.data[i] + if self.mode == Mode.default: + i = self.key2row.get(get_string_id(key), None) + if i is None: + raise KeyError(Errors.E058.format(key=key)) + else: + return self.data[i] + elif self.mode == Mode.floret: + return self.get_batch([key])[0] + raise KeyError(Errors.E058.format(key=key)) def __setitem__(self, key, vector): """Set a vector for the given key. - key (int): The key to set the vector for. + key (str/int): The key to set the vector for. vector (ndarray): The vector to set. DOCS: https://spacy.io/api/vectors#setitem """ + if self.mode == Mode.floret: + warnings.warn(Warnings.W115.format(method="Vectors.__setitem__")) + return + key = get_string_id(key) i = self.key2row[key] self.data[i] = vector if self._unset.count(i): @@ -175,7 +236,10 @@ cdef class Vectors: DOCS: https://spacy.io/api/vectors#contains """ - return key in self.key2row + if self.mode == Mode.floret: + return True + else: + return key in self.key2row def resize(self, shape, inplace=False): """Resize the underlying vectors array. 
If inplace=True, the memory @@ -192,6 +256,9 @@ cdef class Vectors: DOCS: https://spacy.io/api/vectors#resize """ + if self.mode == Mode.floret: + warnings.warn(Warnings.W115.format(method="Vectors.resize")) + return -1 xp = get_array_module(self.data) if inplace: if shape[1] != self.data.shape[1]: @@ -244,16 +311,23 @@ cdef class Vectors: def find(self, *, key=None, keys=None, row=None, rows=None): """Look up one or more keys by row, or vice versa. - key (str / int): Find the row that the given key points to. + key (Union[int, str]): Find the row that the given key points to. Returns int, -1 if missing. - keys (iterable): Find rows that the keys point to. + keys (Iterable[Union[int, str]]): Find rows that the keys point to. Returns ndarray. row (int): Find the first key that points to the row. Returns int. - rows (iterable): Find the keys that point to the rows. + rows (Iterable[int]): Find the keys that point to the rows. Returns ndarray. RETURNS: The requested key, keys, row or rows. """ + if self.mode == Mode.floret: + raise ValueError( + Errors.E858.format( + mode=self.mode, + alternative="Use Vectors[key] instead.", + ) + ) if sum(arg is None for arg in (key, keys, row, rows)) != 3: bad_kwargs = {"key": key, "keys": keys, "row": row, "rows": rows} raise ValueError(Errors.E059.format(kwargs=bad_kwargs)) @@ -273,6 +347,67 @@ cdef class Vectors: results = [row2key[row] for row in rows] return xp.asarray(results, dtype="uint64") + def _get_ngram_hashes(self, unicode s): + """Calculate up to 4 32-bit hash values with MurmurHash3_x64_128 using + the floret hash settings. + key (str): The string key. + RETURNS: A list of the integer hashes. + """ + cdef uint32_t[4] out + chars = s.encode("utf8") + cdef char* utf8_string = chars + hash128_x64(utf8_string, len(chars), self.hash_seed, &out) + rows = [out[i] for i in range(min(self.hash_count, 4))] + return rows + + def _get_ngrams(self, unicode key): + """Get all padded ngram strings using the ngram settings. + key (str): The string key. + RETURNS: A list of the ngram strings for the padded key. + """ + key = self.bow + key + self.eow + ngrams = [key] + [ + key[start:start+ngram_size] + for ngram_size in range(self.minn, self.maxn + 1) + for start in range(0, len(key) - ngram_size + 1) + ] + return ngrams + + def get_batch(self, keys): + """Get the vectors for the provided keys efficiently as a batch. + keys (Iterable[Union[int, str]]): The keys. + RETURNS: The requested vectors from the vector table. + """ + ops = get_array_ops(self.data) + if self.mode == Mode.default: + rows = self.find(keys=keys) + vecs = self.data[rows] + elif self.mode == Mode.floret: + keys = [self.strings.as_string(key) for key in keys] + if sum(len(key) for key in keys) == 0: + return ops.xp.zeros((len(keys), self.data.shape[1])) + unique_keys = tuple(set(keys)) + row_index = {key: i for i, key in enumerate(unique_keys)} + rows = [row_index[key] for key in keys] + indices = [] + lengths = [] + for key in unique_keys: + if key == "": + ngram_rows = [] + else: + ngram_rows = [ + h % self.data.shape[0] + for ngram in self._get_ngrams(key) + for h in self._get_ngram_hashes(ngram) + ] + indices.extend(ngram_rows) + lengths.append(len(ngram_rows)) + indices = ops.asarray(indices, dtype="int32") + lengths = ops.asarray(lengths, dtype="int32") + vecs = ops.reduce_mean(cast(Floats2d, self.data[indices]), lengths) + vecs = vecs[rows] + return ops.as_contig(vecs) + def add(self, key, *, vector=None, row=None): """Add a key to the table. 
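Stepping back to `get_batch` above for a moment: a rough, self-contained sketch of how the floret lookup behaves. The table contents and settings below are made up purely for illustration:

```python
import numpy
from spacy.strings import StringStore
from spacy.vectors import Vectors

table = numpy.random.default_rng(0).standard_normal((1000, 32)).astype("f")
vecs = Vectors(
    strings=StringStore(), data=table, mode="floret", minn=3, maxn=4, hash_count=2
)
# Every key gets a vector, including words never seen during training: the
# padded ngrams of "<kitten>" are hashed, each hash selects a row modulo the
# table size, and the selected rows are averaged.
batch = vecs.get_batch(["kitten", "kittens"])
assert batch.shape == (2, 32)
assert "kittens" in vecs  # __contains__ is always True in floret mode
```

Because related surface forms share most of their ngrams, their vectors end up close together, which is what makes the compact table workable.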
Keys can be mapped to an existing vector by setting `row`, or a new vector can be added. @@ -284,6 +419,9 @@ cdef class Vectors: DOCS: https://spacy.io/api/vectors#add """ + if self.mode == Mode.floret: + warnings.warn(Warnings.W115.format(method="Vectors.add")) + return -1 # use int for all keys and rows in key2row for more efficient access # and serialization key = int(get_string_id(key)) @@ -324,6 +462,11 @@ cdef class Vectors: RETURNS (tuple): The most similar entries as a `(keys, best_rows, scores)` tuple. """ + if self.mode == Mode.floret: + raise ValueError(Errors.E858.format( + mode=self.mode, + alternative="", + )) xp = get_array_module(self.data) filled = sorted(list({row for row in self.key2row.values()})) if len(filled) < n: @@ -368,7 +511,32 @@ cdef class Vectors: for i in range(len(queries)) ], dtype="uint64") return (keys, best_rows, scores) - def to_disk(self, path, **kwargs): + def _get_cfg(self): + if self.mode == Mode.default: + return { + "mode": Mode(self.mode).value, + } + elif self.mode == Mode.floret: + return { + "mode": Mode(self.mode).value, + "minn": self.minn, + "maxn": self.maxn, + "hash_count": self.hash_count, + "hash_seed": self.hash_seed, + "bow": self.bow, + "eow": self.eow, + } + + def _set_cfg(self, cfg): + self.mode = Mode(cfg.get("mode", Mode.default)).value + self.minn = cfg.get("minn", 0) + self.maxn = cfg.get("maxn", 0) + self.hash_count = cfg.get("hash_count", 0) + self.hash_seed = cfg.get("hash_seed", 0) + self.bow = cfg.get("bow", "<") + self.eow = cfg.get("eow", ">") + + def to_disk(self, path, *, exclude=tuple()): """Save the current state to a directory. path (str / Path): A path to a directory, which will be created if @@ -390,12 +558,14 @@ cdef class Vectors: save_array(self.data, _file) serializers = { + "strings": lambda p: self.strings.to_disk(p.with_suffix(".json")), "vectors": lambda p: save_vectors(p), - "key2row": lambda p: srsly.write_msgpack(p, self.key2row) + "key2row": lambda p: srsly.write_msgpack(p, self.key2row), + "vectors.cfg": lambda p: srsly.write_json(p, self._get_cfg()), } - return util.to_disk(path, serializers, []) + return util.to_disk(path, serializers, exclude) - def from_disk(self, path, **kwargs): + def from_disk(self, path, *, exclude=tuple()): """Loads state from a directory. Modifies the object in place and returns it. @@ -422,17 +592,23 @@ cdef class Vectors: if path.exists(): self.data = ops.xp.load(str(path)) + def load_settings(path): + if path.exists(): + self._set_cfg(srsly.read_json(path)) + serializers = { + "strings": lambda p: self.strings.from_disk(p.with_suffix(".json")), "vectors": load_vectors, "keys": load_keys, "key2row": load_key2row, + "vectors.cfg": load_settings, } - util.from_disk(path, serializers, []) + util.from_disk(path, serializers, exclude) self._sync_unset() return self - def to_bytes(self, **kwargs): + def to_bytes(self, *, exclude=tuple()): """Serialize the current state to a binary string. exclude (list): String names of serialization fields to exclude. @@ -447,12 +623,14 @@ cdef class Vectors: return srsly.msgpack_dumps(self.data) serializers = { + "strings": lambda: self.strings.to_bytes(), "key2row": lambda: srsly.msgpack_dumps(self.key2row), - "vectors": serialize_weights + "vectors": serialize_weights, + "vectors.cfg": lambda: srsly.json_dumps(self._get_cfg()), } - return util.to_bytes(serializers, []) + return util.to_bytes(serializers, exclude) - def from_bytes(self, data, **kwargs): + def from_bytes(self, data, *, exclude=tuple()): """Load state from a binary string. 
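As an aside on the serialization changes above (the new `exclude` argument, the string store saved next to the table, and the `vectors.cfg` entry carrying the mode and floret settings), here is a small round-trip sketch; it is an illustration, not a test from the spaCy test suite:

```python
import numpy
from spacy.strings import StringStore
from spacy.vectors import Vectors

vectors = Vectors(
    strings=StringStore(), data=numpy.zeros((2, 4), dtype="f"), keys=["a", "b"]
)
# Exclude the strings, as Vocab does when it manages the StringStore itself.
blob = vectors.to_bytes(exclude=["strings"])
restored = Vectors(strings=StringStore()).from_bytes(blob, exclude=["strings"])
assert restored.shape == (2, 4)
assert restored.mode == "default"  # the mode round-trips via vectors.cfg
```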
data (bytes): The data to load from. @@ -469,13 +647,25 @@ cdef class Vectors: self.data = xp.asarray(srsly.msgpack_loads(b)) deserializers = { + "strings": lambda b: self.strings.from_bytes(b), "key2row": lambda b: self.key2row.update(srsly.msgpack_loads(b)), - "vectors": deserialize_weights + "vectors": deserialize_weights, + "vectors.cfg": lambda b: self._set_cfg(srsly.json_loads(b)) } - util.from_bytes(data, deserializers, []) + util.from_bytes(data, deserializers, exclude) self._sync_unset() return self + def clear(self): + """Clear all entries in the vector table. + + DOCS: https://spacy.io/api/vectors#clear + """ + if self.mode == Mode.floret: + raise ValueError(Errors.E859) + self.key2row = {} + self._sync_unset() + def _sync_unset(self): filled = {row for row in self.key2row.values()} self._unset = cppset[int]({row for row in range(self.data.shape[0]) if row not in filled}) diff --git a/spacy/vocab.pxd b/spacy/vocab.pxd index b28ad3e85..9c951b2b7 100644 --- a/spacy/vocab.pxd +++ b/spacy/vocab.pxd @@ -27,7 +27,7 @@ cdef class Vocab: cdef Pool mem cdef readonly StringStore strings cdef public Morphology morphology - cdef public object vectors + cdef public object _vectors cdef public object _lookups cdef public object writing_system cdef public object get_noun_chunks diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 9840603f5..e2e7ad1db 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -14,7 +14,7 @@ from .attrs cimport LANG, ORTH from .compat import copy_reg from .errors import Errors from .attrs import intify_attrs, NORM, IS_STOP -from .vectors import Vectors +from .vectors import Vectors, Mode as VectorsMode from .util import registry from .lookups import Lookups from . import util @@ -77,11 +77,21 @@ cdef class Vocab: _ = self[string] self.lex_attr_getters = lex_attr_getters self.morphology = Morphology(self.strings) - self.vectors = Vectors(name=vectors_name) + self.vectors = Vectors(strings=self.strings, name=vectors_name) self.lookups = lookups self.writing_system = writing_system self.get_noun_chunks = get_noun_chunks + property vectors: + def __get__(self): + return self._vectors + + def __set__(self, vectors): + for s in vectors.strings: + self.strings.add(s) + self._vectors = vectors + self._vectors.strings = self.strings + @property def lang(self): langfunc = None @@ -282,10 +292,10 @@ cdef class Vocab: if width is not None and shape is not None: raise ValueError(Errors.E065.format(width=width, shape=shape)) elif shape is not None: - self.vectors = Vectors(shape=shape) + self.vectors = Vectors(strings=self.strings, shape=shape) else: width = width if width is not None else self.vectors.data.shape[1] - self.vectors = Vectors(shape=(self.vectors.shape[0], width)) + self.vectors = Vectors(strings=self.strings, shape=(self.vectors.shape[0], width)) def prune_vectors(self, nr_row, batch_size=1024): """Reduce the current vector table to `nr_row` unique entries. 
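A note on the new `Vocab.vectors` property above: assigning a vectors table now copies the table's strings into the vocab's `StringStore` and points the table back at that shared store. A small sketch with a made-up key:

```python
import numpy
import spacy
from spacy.strings import StringStore
from spacy.vectors import Vectors

nlp = spacy.blank("en")
vecs = Vectors(
    strings=StringStore(["snek"]),
    data=numpy.zeros((1, 5), dtype="f"),
    keys=["snek"],
)
nlp.vocab.vectors = vecs
assert "snek" in nlp.vocab.strings  # strings were merged into the vocab
assert nlp.vocab.vectors.strings is nlp.vocab.strings  # and re-pointed
```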
Words @@ -314,6 +324,8 @@ cdef class Vocab: DOCS: https://spacy.io/api/vocab#prune_vectors """ + if self.vectors.mode != VectorsMode.default: + raise ValueError(Errors.E866) ops = get_current_ops() xp = get_array_module(self.vectors.data) # Make sure all vectors are in the vocab @@ -328,7 +340,7 @@ cdef class Vocab: keys = xp.asarray([key for (prob, i, key) in priority], dtype="uint64") keep = xp.ascontiguousarray(self.vectors.data[indices[:nr_row]]) toss = xp.ascontiguousarray(self.vectors.data[indices[nr_row:]]) - self.vectors = Vectors(data=keep, keys=keys[:nr_row], name=self.vectors.name) + self.vectors = Vectors(strings=self.strings, data=keep, keys=keys[:nr_row], name=self.vectors.name) syn_keys, syn_rows, scores = self.vectors.most_similar(toss, batch_size=batch_size) syn_keys = ops.to_numpy(syn_keys) remap = {} @@ -340,19 +352,12 @@ cdef class Vocab: remap[word] = (synonym, score) return remap - def get_vector(self, orth, minn=None, maxn=None): + def get_vector(self, orth): """Retrieve a vector for a word in the vocabulary. Words can be looked up by string or int ID. If no vectors data is loaded, ValueError is raised. - If `minn` is defined, then the resulting vector uses Fasttext's - subword features by average over ngrams of `orth`. - - orth (int / str): The hash value of a word, or its unicode string. - minn (int): Minimum n-gram length used for Fasttext's ngram computation. - Defaults to the length of `orth`. - maxn (int): Maximum n-gram length used for Fasttext's ngram computation. - Defaults to the length of `orth`. + orth (int / unicode): The hash value of a word, or its unicode string. RETURNS (numpy.ndarray or cupy.ndarray): A word vector. Size and shape determined by the `vocab.vectors` instance. Usually, a numpy ndarray of shape (300,) and dtype float32. @@ -361,40 +366,10 @@ cdef class Vocab: """ if isinstance(orth, str): orth = self.strings.add(orth) - word = self[orth].orth_ - if orth in self.vectors.key2row: + if self.has_vector(orth): return self.vectors[orth] xp = get_array_module(self.vectors.data) vectors = xp.zeros((self.vectors_length,), dtype="f") - if minn is None: - return vectors - # Fasttext's ngram computation taken from - # https://github.com/facebookresearch/fastText - # Assign default ngram limit to maxn which is the length of the word. - if maxn is None: - maxn = len(word) - ngrams_size = 0; - for i in range(len(word)): - ngram = "" - if (word[i] and 0xC0) == 0x80: - continue - n = 1 - j = i - while (j < len(word) and n <= maxn): - if n > maxn: - break - ngram += word[j] - j = j + 1 - while (j < len(word) and (word[j] and 0xC0) == 0x80): - ngram += word[j] - j = j + 1 - if (n >= minn and not (n == 1 and (i == 0 or j == len(word)))): - if self.strings[ngram] in self.vectors.key2row: - vectors = xp.add(self.vectors[self.strings[ngram]], vectors) - ngrams_size += 1 - n = n + 1 - if ngrams_size > 0: - vectors = vectors * (1.0/ngrams_size) return vectors def set_vector(self, orth, vector): @@ -417,7 +392,8 @@ cdef class Vocab: self.vectors.resize((new_rows, width)) lex = self[orth] # Add word to vocab if necessary row = self.vectors.add(orth, vector=vector) - lex.rank = row + if row >= 0: + lex.rank = row def has_vector(self, orth): """Check whether a word has a vector. 
Returns False if no vectors have @@ -461,7 +437,7 @@ cdef class Vocab: if "strings" not in exclude: self.strings.to_disk(path / "strings.json") if "vectors" not in "exclude": - self.vectors.to_disk(path) + self.vectors.to_disk(path, exclude=["strings"]) if "lookups" not in "exclude": self.lookups.to_disk(path) @@ -504,7 +480,7 @@ cdef class Vocab: if self.vectors is None: return None else: - return self.vectors.to_bytes() + return self.vectors.to_bytes(exclude=["strings"]) getters = { "strings": lambda: self.strings.to_bytes(), @@ -526,7 +502,7 @@ cdef class Vocab: if self.vectors is None: return None else: - return self.vectors.from_bytes(b) + return self.vectors.from_bytes(b, exclude=["strings"]) setters = { "strings": lambda b: self.strings.from_bytes(b), diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index 470d11a3a..337c6df09 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -208,6 +208,7 @@ $ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--tr | `output_dir` | Pipeline output directory. Will be created if it doesn't exist. ~~Path (positional)~~ | | `--truncate`, `-t` | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. ~~int (option)~~ | | `--prune`, `-p` | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. ~~int (option)~~ | +| `--mode`, `-m` | Vectors mode: `default` or [`floret`](https://github.com/explosion/floret). Defaults to `default`. ~~Optional[str] \(option)~~ | | `--name`, `-n` | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. ~~Optional[str] \(option)~~ | | `--verbose`, `-V` | Print additional information and explanations. ~~bool (flag)~~ | | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | diff --git a/website/docs/api/vectors.md b/website/docs/api/vectors.md index 1a7f7a3f5..84d2c00ad 100644 --- a/website/docs/api/vectors.md +++ b/website/docs/api/vectors.md @@ -8,15 +8,30 @@ new: 2 Vectors data is kept in the `Vectors.data` attribute, which should be an instance of `numpy.ndarray` (for CPU vectors) or `cupy.ndarray` (for GPU -vectors). Multiple keys can be mapped to the same vector, and not all of the -rows in the table need to be assigned – so `vectors.n_keys` may be greater or -smaller than `vectors.shape[0]`. +vectors). + +As of spaCy v3.2, `Vectors` supports two types of vector tables: + +- `default`: A standard vector table (as in spaCy v3.1 and earlier) where each + key is mapped to one row in the vector table. Multiple keys can be mapped to + the same vector, and not all of the rows in the table need to be assigned – so + `vectors.n_keys` may be greater or smaller than `vectors.shape[0]`. +- `floret`: Only supports vectors trained with + [floret](https://github.com/explosion/floret), an extended version of + [fastText](https://fasttext.cc) that produces compact vector tables by + combining fastText's subword ngrams with Bloom embeddings. The compact tables + are similar to the [`HashEmbed`](https://thinc.ai/docs/api-layers#hashembed) + embeddings already used in many spaCy components. Each word is represented as + the sum of one or more rows as determined by the settings related to character + ngrams and the hash table. ## Vectors.\_\_init\_\_ {#init tag="method"} -Create a new vector store. You can set the vector values and keys directly on -initialization, or supply a `shape` keyword argument to create an empty table -you can add vectors to later. 
+Create a new vector store. With the default mode, you can set the vector values +and keys directly on initialization, or supply a `shape` keyword argument to +create an empty table you can add vectors to later. In floret mode, the complete +vector data and settings must be provided on initialization and cannot be +modified later. > #### Example > @@ -30,13 +45,21 @@ you can add vectors to later. > vectors = Vectors(data=data, keys=keys) > ``` -| Name | Description | -| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| _keyword-only_ | | -| `shape` | Size of the table as `(n_entries, n_columns)`, the number of entries and number of columns. Not required if you're initializing the object with `data` and `keys`. ~~Tuple[int, int]~~ | -| `data` | The vector data. ~~numpy.ndarray[ndim=1, dtype=float32]~~ | -| `keys` | A sequence of keys aligned with the data. ~~Iterable[Union[str, int]]~~ | -| `name` | A name to identify the vectors table. ~~str~~ | +| Name | Description | +| ----------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| _keyword-only_ | | +| `strings` | The string store. A new string store is created if one is not provided. Defaults to `None`. ~~Optional[StringStore]~~ | +| `shape` | Size of the table as `(n_entries, n_columns)`, the number of entries and number of columns. Not required if you're initializing the object with `data` and `keys`. ~~Tuple[int, int]~~ | +| `data` | The vector data. ~~numpy.ndarray[ndim=1, dtype=float32]~~ | +| `keys` | A sequence of keys aligned with the data. ~~Iterable[Union[str, int]]~~ | +| `name` | A name to identify the vectors table. ~~str~~ | +| `mode` 3.2 | Vectors mode: `"default"` or [`"floret"`](https://github.com/explosion/floret) (default: `"default"`). ~~str~~ | +| `minn` 3.2 | The floret char ngram minn (default: `0`). ~~int~~ | +| `maxn` 3.2 | The floret char ngram maxn (default: `0`). ~~int~~ | +| `hash_count` 3.2 | The floret hash count. Supported values: 1--4 (default: `1`). ~~int~~ | +| `hash_seed` 3.2 | The floret hash seed (default: `0`). ~~int~~ | +| `bow` 3.2 | The floret BOW string (default: `"<"`). ~~str~~ | +| `eow` 3.2 | The floret EOW string (default: `">"`). ~~str~~ | ## Vectors.\_\_getitem\_\_ {#getitem tag="method"} @@ -53,12 +76,12 @@ raised. | Name | Description | | ----------- | ---------------------------------------------------------------- | -| `key` | The key to get the vector for. ~~int~~ | +| `key` | The key to get the vector for. ~~Union[int, str]~~ | | **RETURNS** | The vector for the key. ~~numpy.ndarray[ndim=1, dtype=float32]~~ | ## Vectors.\_\_setitem\_\_ {#setitem tag="method"} -Set a vector for the given key. +Set a vector for the given key. Not supported for `floret` mode. > #### Example > @@ -75,7 +98,8 @@ Set a vector for the given key. ## Vectors.\_\_iter\_\_ {#iter tag="method"} -Iterate over the keys in the table. +Iterate over the keys in the table. In `floret` mode, the keys table is not +used. > #### Example > @@ -105,7 +129,8 @@ Return the number of vectors in the table. ## Vectors.\_\_contains\_\_ {#contains tag="method"} -Check whether a key has been mapped to a vector entry in the table. +Check whether a key has been mapped to a vector entry in the table. 
In `floret` +mode, returns `True` for all keys. > #### Example > @@ -123,11 +148,8 @@ Check whether a key has been mapped to a vector entry in the table. ## Vectors.add {#add tag="method"} Add a key to the table, optionally setting a vector value as well. Keys can be -mapped to an existing vector by setting `row`, or a new vector can be added. -When adding string keys, keep in mind that the `Vectors` class itself has no -[`StringStore`](/api/stringstore), so you have to store the hash-to-string -mapping separately. If you need to manage the strings, you should use the -`Vectors` via the [`Vocab`](/api/vocab) class, e.g. `vocab.vectors`. +mapped to an existing vector by setting `row`, or a new vector can be added. Not +supported for `floret` mode. > #### Example > @@ -152,7 +174,8 @@ Resize the underlying vectors array. If `inplace=True`, the memory is reallocated. This may cause other references to the data to become invalid, so only use `inplace=True` if you're sure that's what you want. If the number of vectors is reduced, keys mapped to rows that have been deleted are removed. -These removed items are returned as a list of `(key, row)` tuples. +These removed items are returned as a list of `(key, row)` tuples. Not supported +for `floret` mode. > #### Example > @@ -168,7 +191,8 @@ These removed items are returned as a list of `(key, row)` tuples. ## Vectors.keys {#keys tag="method"} -A sequence of the keys in the table. +A sequence of the keys in the table. In `floret` mode, the keys table is not +used. > #### Example > @@ -185,7 +209,7 @@ A sequence of the keys in the table. Iterate over vectors that have been assigned to at least one key. Note that some vectors may be unassigned, so the number of vectors returned may be less than -the length of the vectors table. +the length of the vectors table. In `floret` mode, the keys table is not used. > #### Example > @@ -200,7 +224,8 @@ the length of the vectors table. ## Vectors.items {#items tag="method"} -Iterate over `(key, vector)` pairs, in order. +Iterate over `(key, vector)` pairs, in order. In `floret` mode, the keys table +is empty. > #### Example > @@ -215,7 +240,7 @@ Iterate over `(key, vector)` pairs, in order. ## Vectors.find {#find tag="method"} -Look up one or more keys by row, or vice versa. +Look up one or more keys by row, or vice versa. Not supported for `floret` mode. > #### Example > @@ -273,7 +298,8 @@ The vector size, i.e. `rows * dims`. Whether the vectors table is full and has no slots are available for new keys. If a table is full, it can be resized using -[`Vectors.resize`](/api/vectors#resize). +[`Vectors.resize`](/api/vectors#resize). In `floret` mode, the table is always +full and cannot be resized. > #### Example > @@ -291,7 +317,7 @@ If a table is full, it can be resized using Get the number of keys in the table. Note that this is the number of _all_ keys, not just unique vectors. If several keys are mapped to the same vectors, they -will be counted individually. +will be counted individually. In `floret` mode, the keys table is not used. > #### Example > @@ -311,7 +337,8 @@ For each of the given vectors, find the `n` most similar entries to it by cosine. Queries are by vector. Results are returned as a `(keys, best_rows, scores)` tuple. If `queries` is large, the calculations are performed in chunks to avoid consuming too much memory. You can set the -`batch_size` to control the size/space trade-off during the calculations. +`batch_size` to control the size/space trade-off during the calculations. 
Not +supported for `floret` mode. > #### Example > @@ -329,6 +356,21 @@ performed in chunks to avoid consuming too much memory. You can set the | `sort` | Whether to sort the entries returned by score. Defaults to `True`. ~~bool~~ | | **RETURNS** | tuple | The most similar entries as a `(keys, best_rows, scores)` tuple. ~~Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]~~ | +## Vectors.get_batch {#get_batch tag="method" new="3.2"} + +Get the vectors for the provided keys efficiently as a batch. + +> #### Example +> +> ```python +> words = ["cat", "dog"] +> vectors = nlp.vocab.vectors.get_batch(words) +> ``` + +| Name | Description | +| ------ | --------------------------------------- | +| `keys` | The keys. ~~Iterable[Union[int, str]]~~ | + ## Vectors.to_disk {#to_disk tag="method"} Save the current state to a directory. From 554fa414ecf7e6b74aeed6af70bd74346e7abf31 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 28 Oct 2021 11:18:19 +0200 Subject: [PATCH 036/133] Require spacy-transformers v1.1 in transformers extra (#9557) So that the install/upgrade quickstart also upgrades `spacy-transformers` with `pip install spacy[transformers]`, require `spacy-transformers>=1.1.2` in the `transformers` extra. --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index d007fb160..5431d39e0 100644 --- a/setup.cfg +++ b/setup.cfg @@ -72,7 +72,7 @@ console_scripts = lookups = spacy_lookups_data>=1.0.3,<1.1.0 transformers = - spacy_transformers>=1.0.1,<1.2.0 + spacy_transformers>=1.1.2,<1.2.0 ray = spacy_ray>=0.1.0,<1.0.0 cuda = From 12974bf4d975a8789f86ba91c7cefc31518ed9e4 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 29 Oct 2021 10:29:29 +0200 Subject: [PATCH 037/133] Add micro PRF for morph scoring (#9546) * Add micro PRF for morph scoring For pipelines where morph features are added by more than one component and a reference training corpus may not contain all features, a micro PRF score is more flexible than a simple accuracy score. An example is the reading and inflection features added by the Japanese tokenizer. * Use `morph_micro_f` as the default morph score for Japanese morphologizers. * Update docstring * Fix typo in docstring * Update Scorer API docs * Fix results type * Organize score list by attribute prefix --- spacy/lang/ja/__init__.py | 2 +- spacy/scorer.py | 30 ++++++++++++++++++++---------- spacy/tests/test_scorer.py | 2 ++ website/docs/api/scorer.md | 33 +++++++++++++++++++-------------- 4 files changed, 42 insertions(+), 25 deletions(-) diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py index 127c4c8ac..81ff5b5b8 100644 --- a/spacy/lang/ja/__init__.py +++ b/spacy/lang/ja/__init__.py @@ -203,7 +203,7 @@ class Japanese(Language): "extend": True, "scorer": {"@scorers": "spacy.morphologizer_scorer.v1"}, }, - default_score_weights={"pos_acc": 0.5, "morph_acc": 0.5, "morph_per_feat": None}, + default_score_weights={"pos_acc": 0.5, "morph_micro_f": 0.5, "morph_per_feat": None}, ) def make_morphologizer( nlp: Language, diff --git a/spacy/scorer.py b/spacy/scorer.py index 49d51a4b3..75e5b3317 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -247,18 +247,21 @@ class Scorer: missing_values: Set[Any] = MISSING_VALUES, # type: ignore[assignment] **cfg, ) -> Dict[str, Any]: - """Return PRF scores per feat for a token attribute in UFEATS format. + """Return micro PRF and PRF scores per feat for a token attribute in + UFEATS format. 
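To make the new micro scores concrete, here is a small, hand-constructed example (not taken from the spaCy test suite). The key-based getter mirrors the one the morphologizer component passes in:

```python
import spacy
from spacy.scorer import Scorer
from spacy.training import Example

nlp = spacy.blank("en")
ref = nlp.make_doc("she runs")
ref[1].set_morph("Number=Sing|Person=3|Tense=Pres")
pred = nlp.make_doc("she runs")
pred[1].set_morph("Number=Sing|Tense=Past")

def morph_key_getter(token, attr):
    return getattr(token, attr).key

scores = Scorer.score_token_attr_per_feat(
    [Example(pred, ref)], "morph", getter=morph_key_getter
)
# Only Number=Sing matches; Tense has the wrong value and Person is missing,
# so the pooled micro scores land strictly between 0 and 1, while the
# per-feature score for Number is still 1.0.
print(scores["morph_micro_p"], scores["morph_micro_r"], scores["morph_micro_f"])
print(scores["morph_per_feat"]["Number"]["f"])
```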
examples (Iterable[Example]): Examples to score attr (str): The attribute to score. getter (Callable[[Token, str], Any]): Defaults to getattr. If provided, getter(token, attr) should return the value of the attribute for an individual token. - missing_values (Set[Any]): Attribute values to treat as missing annotation - in the reference annotation. - RETURNS (dict): A dictionary containing the per-feat PRF scores under - the key attr_per_feat. + missing_values (Set[Any]): Attribute values to treat as missing + annotation in the reference annotation. + RETURNS (dict): A dictionary containing the micro PRF scores under the + key attr_micro_p/r/f and the per-feat PRF scores under + attr_per_feat. """ + micro_score = PRFScore() per_feat = {} for example in examples: pred_doc = example.predicted @@ -300,15 +303,22 @@ class Scorer: pred_per_feat[field] = set() pred_per_feat[field].add((gold_i, feat)) for field in per_feat: + micro_score.score_set(pred_per_feat.get(field, set()), gold_per_feat.get(field, set())) per_feat[field].score_set( pred_per_feat.get(field, set()), gold_per_feat.get(field, set()) ) - score_key = f"{attr}_per_feat" - if any([len(v) for v in per_feat.values()]): - result = {k: v.to_dict() for k, v in per_feat.items()} - return {score_key: result} + result: Dict[str, Any] = {} + if len(micro_score) > 0: + result[f"{attr}_micro_p"] = micro_score.precision + result[f"{attr}_micro_r"] = micro_score.recall + result[f"{attr}_micro_f"] = micro_score.fscore + result[f"{attr}_per_feat"] = {k: v.to_dict() for k, v in per_feat.items()} else: - return {score_key: None} + result[f"{attr}_micro_p"] = None + result[f"{attr}_micro_r"] = None + result[f"{attr}_micro_f"] = None + result[f"{attr}_per_feat"] = None + return result @staticmethod def score_spans( diff --git a/spacy/tests/test_scorer.py b/spacy/tests/test_scorer.py index 16cc97f6d..6e15fa2de 100644 --- a/spacy/tests/test_scorer.py +++ b/spacy/tests/test_scorer.py @@ -249,6 +249,7 @@ def test_tag_score(tagged_doc): assert results["tag_acc"] == 1.0 assert results["pos_acc"] == 1.0 assert results["morph_acc"] == 1.0 + assert results["morph_micro_f"] == 1.0 assert results["morph_per_feat"]["NounType"]["f"] == 1.0 # Gold annotation is modified @@ -272,6 +273,7 @@ def test_tag_score(tagged_doc): assert results["tag_acc"] == 0.9 assert results["pos_acc"] == 0.9 assert results["morph_acc"] == approx(0.8) + assert results["morph_micro_f"] == approx(0.8461538) assert results["morph_per_feat"]["NounType"]["f"] == 1.0 assert results["morph_per_feat"]["Poss"]["f"] == 0.0 assert results["morph_per_feat"]["Number"]["f"] == approx(0.72727272) diff --git a/website/docs/api/scorer.md b/website/docs/api/scorer.md index da7da5f82..8dbe3b276 100644 --- a/website/docs/api/scorer.md +++ b/website/docs/api/scorer.md @@ -41,15 +41,20 @@ Calculate the scores for a list of [`Example`](/api/example) objects using the scoring methods provided by the components in the pipeline. The returned `Dict` contains the scores provided by the individual pipeline -components. For the scoring methods provided by the `Scorer` and use by the core -pipeline components, the individual score names start with the `Token` or `Doc` -attribute being scored: +components. 
For the scoring methods provided by the `Scorer` and used by the +core pipeline components, the individual score names start with the `Token` or +`Doc` attribute being scored: -- `token_acc`, `token_p`, `token_r`, `token_f`, +- `token_acc`, `token_p`, `token_r`, `token_f` - `sents_p`, `sents_r`, `sents_f` -- `tag_acc`, `pos_acc`, `morph_acc`, `morph_per_feat`, `lemma_acc` +- `tag_acc` +- `pos_acc` +- `morph_acc`, `morph_micro_p`, `morph_micro_r`, `morph_micro_f`, + `morph_per_feat` +- `lemma_acc` - `dep_uas`, `dep_las`, `dep_las_per_type` - `ents_p`, `ents_r` `ents_f`, `ents_per_type` +- `spans_sc_p`, `spans_sc_r`, `spans_sc_f` - `cats_score` (depends on config, description provided in `cats_score_desc`), `cats_micro_p`, `cats_micro_r`, `cats_micro_f`, `cats_macro_p`, `cats_macro_r`, `cats_macro_f`, `cats_macro_auc`, `cats_f_per_type`, @@ -84,7 +89,7 @@ Docs with `has_unknown_spaces` are skipped during scoring. > ``` | Name | Description | -| ----------- | ------------------------------------------------------------------------------------------------------------------- | +| ----------- | ------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------ | | `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ | | **RETURNS** | `Dict` | A dictionary containing the scores `token_acc`, `token_p`, `token_r`, `token_f`. ~~Dict[str, float]]~~ | @@ -124,14 +129,14 @@ scoring. > print(scores["morph_per_feat"]) > ``` -| Name | Description | -| ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ | -| `attr` | The attribute to score. ~~str~~ | -| _keyword-only_ | | -| `getter` | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. ~~Callable[[Token, str], Any]~~ | -| `missing_values` | Attribute values to treat as missing annotation in the reference annotation. Defaults to `{0, None, ""}`. ~~Set[Any]~~ | -| **RETURNS** | A dictionary containing the per-feature PRF scores under the key `{attr}_per_feat`. ~~Dict[str, Dict[str, float]]~~ | +| Name | Description | +| ---------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ | +| `attr` | The attribute to score. ~~str~~ | +| _keyword-only_ | | +| `getter` | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. ~~Callable[[Token, str], Any]~~ | +| `missing_values` | Attribute values to treat as missing annotation in the reference annotation. Defaults to `{0, None, ""}`. ~~Set[Any]~~ | +| **RETURNS** | A dictionary containing the micro PRF scores under the key `{attr}_micro_p/r/f` and the per-feature PRF scores under `{attr}_per_feat`. 
~~Dict[str, Dict[str, float]]~~ | ## Scorer.score_spans {#score_spans tag="staticmethod" new="3"} From 322635e3711f0d496720a8769c6e4f135016c7db Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 29 Oct 2021 15:22:40 +0200 Subject: [PATCH 038/133] Set version to v3.2.0 (#9565) --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index f6043353e..29f78805c 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy" -__version__ = "3.2.0.dev0" +__version__ = "3.2.0" __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" __projects__ = "https://github.com/explosion/projects" From bb26550e22af3e1cfc7f66dd54a607cf50822e84 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 29 Oct 2021 16:25:43 +0200 Subject: [PATCH 039/133] Fix StaticVectors after floret+mypy merge (#9566) --- spacy/ml/staticvectors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/ml/staticvectors.py b/spacy/ml/staticvectors.py index 741007bdb..8dd65833b 100644 --- a/spacy/ml/staticvectors.py +++ b/spacy/ml/staticvectors.py @@ -77,7 +77,7 @@ def forward( model.inc_grad( "W", model.ops.gemm( - cast(Floats2d, d_output.data), model.ops.as_contig(V[rows]), trans1=True + cast(Floats2d, d_output.data), model.ops.as_contig(V), trans1=True ), ) return [] From 5279c7c4baf5b9f26a364f07551acecf26a2d42f Mon Sep 17 00:00:00 2001 From: Vasundhara <15020857+vgautam@users.noreply.github.com> Date: Sat, 30 Oct 2021 21:44:29 -0700 Subject: [PATCH 040/133] Fix broken link to mappings-exceptions (#9573) --- website/docs/usage/linguistic-features.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md index f8f47ab53..f748fa8d6 100644 --- a/website/docs/usage/linguistic-features.md +++ b/website/docs/usage/linguistic-features.md @@ -105,7 +105,7 @@ coarse-grained part-of-speech tags and morphological features. that the verb is past tense (e.g. `VBD` for a past tense verb in the Penn Treebank) . 2. For words whose coarse-grained POS is not set by a prior process, a - [mapping table](#mapping-exceptions) maps the fine-grained tags to a + [mapping table](#mappings-exceptions) maps the fine-grained tags to a coarse-grained POS tags and morphological features. ```python From a4dcb68cf69f4178778d9c5d6b0e478ecff0b00e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bruce=20W=2E=20Lee=20=28=EC=9D=B4=EC=9B=85=EC=84=B1=29?= Date: Mon, 1 Nov 2021 18:38:14 +0900 Subject: [PATCH 041/133] Adding LingFeat Software to spaCy Universe. 
(#9574) * add lingfeat in universe * add lingfeat in universe * Fix JSON * Minor cleanup Co-authored-by: Paul O'Leary McCann --- website/meta/universe.json | 59 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/website/meta/universe.json b/website/meta/universe.json index 80608c77d..b31a52cb8 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -3343,6 +3343,65 @@ "category": ["research", "standalone", "scientific"], "tags": ["Text Analytics", "Coherence", "Cohesion"] }, + { + "id": "lingfeat", + "title": "LingFeat", + "slogan": "A Linguistic Feature Extraction (Text Analysis) Tool for Readability Assessment and Text Simplification", + "description": "LingFeat is a feature extraction library which currently extracts 255 linguistic features from English string input. Categories include syntax, semantics, discourse, and also traditional readability formulas. Published in EMNLP 2021.", + "github": "brucewlee/lingfeat", + "pip": "lingfeat", + "code_example": [ + "from lingfeat import extractor", + "", + "", + "text = 'TAEAN, South Chungcheong Province -- Just before sunup, Lee Young-ho, a seasoned fisherman with over 30 years of experience, silently waits for boats carrying blue crabs as the season for the seafood reaches its height. Soon afterward, small and big boats sail into Sinjin Port in Taean County, South Chungcheong Province, the second-largest source of blue crab after Incheon, accounting for 29 percent of total production of the country. A crane lifts 28 boxes filled with blue crabs weighing 40 kilograms each from the boat, worth about 10 million won ($8,500). “It has been a productive fall season for crabbing here. The water temperature is a very important factor affecting crab production. They hate cold water,” Lee said. The temperature of the sea off Taean appeared to have stayed at the level where crabs become active. If the sea temperature suddenly drops, crabs go into their winter dormancy mode, burrowing into the mud and sleeping through the cold months.'", + "", + "", + "#Pass text", + "LingFeat = extractor.pass_text(text)", + "", + "", + "#Preprocess text", + "LingFeat.preprocess()", + "", + "", + "#Extract features", + "#each method returns a dictionary of the corresponding features", + "#Advanced Semantic (AdSem) Features", + "WoKF = LingFeat.WoKF_() #Wikipedia Knowledge Features", + "WBKF = LingFeat.WBKF_() #WeeBit Corpus Knowledge Features", + "OSKF = LingFeat.OSKF_() #OneStopEng Corpus Knowledge Features", + "", + "#Discourse (Disco) Features", + "EnDF = LingFeat.EnDF_() #Entity Density Features", + "EnGF = LingFeat.EnGF_() #Entity Grid Features", + "", + "#Syntactic (Synta) Features", + "PhrF = LingFeat.PhrF_() #Noun/Verb/Adj/Adv/... Phrasal Features", + "TrSF = LingFeat.TrSF_() #(Parse) Tree Structural Features", + "POSF = LingFeat.POSF_() #Noun/Verb/Adj/Adv/... Part-of-Speech Features", + "", + "#Lexico Semantic (LxSem) Features", + "TTRF = LingFeat.TTRF_() #Type Token Ratio Features", + "VarF = LingFeat.VarF_() #Noun/Verb/Adj/Adv Variation Features", + "PsyF = LingFeat.PsyF_() #Psycholinguistic Difficulty of Words (AoA Kuperman)", + "WoLF = LingFeat.WorF_() #Word Familiarity from Frequency Count (SubtlexUS)", + "", + "Shallow Traditional (ShTra) Features", + "ShaF = LingFeat.ShaF_() #Shallow Features (e.g. 
avg number of tokens)", + "TraF = LingFeat.TraF_() #Traditional Formulas" + ], + "code_language": "python", + "thumb": "https://raw.githubusercontent.com/brucewlee/lingfeat/master/img/lingfeat_logo2.png", + "image": "https://raw.githubusercontent.com/brucewlee/lingfeat/master/img/lingfeat_logo.png", + "author": "Bruce W. Lee (이웅성)", + "author_links": { + "github": "brucewlee", + "website": "https://brucewlee.github.io/" + }, + "category": ["research", "scientific"], + "tags": ["Readability", "Simplification", "Feature Extraction", "Syntax", "Discourse", "Semantics", "Lexical"] + }, { "id": "hmrb", "title": "Hammurabi", From 90ec820f05a7e173959c5c71f3d244bf969a769b Mon Sep 17 00:00:00 2001 From: xxyzz Date: Mon, 1 Nov 2021 17:38:41 +0800 Subject: [PATCH 042/133] Add WordDumb to spaCy Universe (#9572) * Add WordDumb to spaCy Universe * Add standalone category Co-authored-by: Paul O'Leary McCann --- website/meta/universe.json | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/website/meta/universe.json b/website/meta/universe.json index b31a52cb8..9b7484a13 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -3577,7 +3577,22 @@ }, "category": ["pipeline", "research", "standalone"], "tags": ["spacy", "python", "nlp", "ner"] - } + }, + { + "id": "WordDumb", + "title": "WordDumb", + "slogan": "A calibre plugin that generates Word Wise and X-Ray files.", + "description": "A calibre plugin that generates Word Wise and X-Ray files then sends them to Kindle. Supports KFX, AZW3 and MOBI eBooks. X-Ray supports 18 languages.", + "github": "xxyzz/WordDumb", + "code_language": "python", + "thumb": "https://raw.githubusercontent.com/xxyzz/WordDumb/master/starfish.svg", + "image": "https://user-images.githubusercontent.com/21101839/130245435-b874f19a-7785-4093-9975-81596efc42bb.png", + "author": "xxyzz", + "author_links": { + "github": "xxyzz" + }, + "category": ["standalone"] + } ], "categories": [ From f1bc655a387a040bbd64d8650eade135215e8a0a Mon Sep 17 00:00:00 2001 From: Lj Miranda <12949683+ljvmiranda921@users.noreply.github.com> Date: Tue, 2 Nov 2021 15:35:49 +0800 Subject: [PATCH 043/133] Add initial Tagalog (tl) tests (#9582) * Add tl_tokenizer to test fixtures * Add tagalog tests --- spacy/tests/conftest.py | 5 ++ spacy/tests/lang/tl/__init__.py | 0 spacy/tests/lang/tl/test_indices.py | 8 ++ spacy/tests/lang/tl/test_punct.py | 127 ++++++++++++++++++++++++++++ spacy/tests/lang/tl/test_text.py | 73 ++++++++++++++++ 5 files changed, 213 insertions(+) create mode 100644 spacy/tests/lang/tl/__init__.py create mode 100644 spacy/tests/lang/tl/test_indices.py create mode 100644 spacy/tests/lang/tl/test_punct.py create mode 100644 spacy/tests/lang/tl/test_text.py diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index b88d11f0e..b4819ed5c 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -290,6 +290,11 @@ def ti_tokenizer(): return get_lang_class("ti")().tokenizer +@pytest.fixture(scope="session") +def tl_tokenizer(): + return get_lang_class("tl")().tokenizer + + @pytest.fixture(scope="session") def tr_tokenizer(): return get_lang_class("tr")().tokenizer diff --git a/spacy/tests/lang/tl/__init__.py b/spacy/tests/lang/tl/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/lang/tl/test_indices.py b/spacy/tests/lang/tl/test_indices.py new file mode 100644 index 000000000..7c99ae573 --- /dev/null +++ b/spacy/tests/lang/tl/test_indices.py @@ -0,0 +1,8 @@ +def 
test_tl_simple_punct(tl_tokenizer): + text = "Sige, punta ka dito" + tokens = tl_tokenizer(text) + assert tokens[0].idx == 0 + assert tokens[1].idx == 4 + assert tokens[2].idx == 6 + assert tokens[3].idx == 12 + assert tokens[4].idx == 15 diff --git a/spacy/tests/lang/tl/test_punct.py b/spacy/tests/lang/tl/test_punct.py new file mode 100644 index 000000000..d6bcf297d --- /dev/null +++ b/spacy/tests/lang/tl/test_punct.py @@ -0,0 +1,127 @@ +import pytest +from spacy.util import compile_prefix_regex +from spacy.lang.punctuation import TOKENIZER_PREFIXES + + +PUNCT_OPEN = ["(", "[", "{", "*"] +PUNCT_CLOSE = [")", "]", "}", "*"] +PUNCT_PAIRED = [("(", ")"), ("[", "]"), ("{", "}"), ("*", "*")] + + +@pytest.mark.parametrize("text", ["(", "((", "<"]) +def test_tl_tokenizer_handles_only_punct(tl_tokenizer, text): + tokens = tl_tokenizer(text) + assert len(tokens) == len(text) + + +@pytest.mark.parametrize("punct", PUNCT_OPEN) +@pytest.mark.parametrize("text", ["Mabuhay"]) +def test_tl_tokenizer_split_open_punct(tl_tokenizer, punct, text): + tokens = tl_tokenizer(punct + text) + assert len(tokens) == 2 + assert tokens[0].text == punct + assert tokens[1].text == text + + +@pytest.mark.parametrize("punct", PUNCT_CLOSE) +@pytest.mark.parametrize("text", ["Mabuhay"]) +def test_tl_tokenizer_splits_close_punct(tl_tokenizer, punct, text): + tokens = tl_tokenizer(text + punct) + assert len(tokens) == 2 + assert tokens[0].text == text + assert tokens[1].text == punct + + +@pytest.mark.parametrize("punct", PUNCT_OPEN) +@pytest.mark.parametrize("punct_add", ["`"]) +@pytest.mark.parametrize("text", ["Mabuhay"]) +def test_tl_tokenizer_splits_two_diff_open_punct(tl_tokenizer, punct, punct_add, text): + tokens = tl_tokenizer(punct + punct_add + text) + assert len(tokens) == 3 + assert tokens[0].text == punct + assert tokens[1].text == punct_add + assert tokens[2].text == text + + +@pytest.mark.parametrize("punct", PUNCT_CLOSE) +@pytest.mark.parametrize("punct_add", ["`"]) +@pytest.mark.parametrize("text", ["Mabuhay"]) +def test_tl_tokenizer_splits_two_diff_close_punct(tl_tokenizer, punct, punct_add, text): + tokens = tl_tokenizer(text + punct + punct_add) + assert len(tokens) == 3 + assert tokens[0].text == text + assert tokens[1].text == punct + assert tokens[2].text == punct_add + + +@pytest.mark.parametrize("punct", PUNCT_OPEN) +@pytest.mark.parametrize("text", ["Mabuhay"]) +def test_tl_tokenizer_splits_same_open_punct(tl_tokenizer, punct, text): + tokens = tl_tokenizer(punct + punct + punct + text) + assert len(tokens) == 4 + assert tokens[0].text == punct + assert tokens[3].text == text + + +@pytest.mark.parametrize("punct", PUNCT_CLOSE) +@pytest.mark.parametrize("text", ["Mabuhay"]) +def test_tl_tokenizer_splits_same_close_punct(tl_tokenizer, punct, text): + tokens = tl_tokenizer(text + punct + punct + punct) + assert len(tokens) == 4 + assert tokens[0].text == text + assert tokens[1].text == punct + + +@pytest.mark.parametrize("text", ["'Ang"]) +def test_tl_tokenizer_splits_open_apostrophe(tl_tokenizer, text): + tokens = tl_tokenizer(text) + assert len(tokens) == 2 + assert tokens[0].text == "'" + + +@pytest.mark.parametrize("text", ["Mabuhay''"]) +def test_tl_tokenizer_splits_double_end_quote(tl_tokenizer, text): + tokens = tl_tokenizer(text) + assert len(tokens) == 2 + tokens_punct = tl_tokenizer("''") + assert len(tokens_punct) == 1 + + +@pytest.mark.parametrize("punct_open,punct_close", PUNCT_PAIRED) +@pytest.mark.parametrize("text", ["Mabuhay"]) +def test_tl_tokenizer_splits_open_close_punct( + 
tl_tokenizer, punct_open, punct_close, text +): + tokens = tl_tokenizer(punct_open + text + punct_close) + assert len(tokens) == 3 + assert tokens[0].text == punct_open + assert tokens[1].text == text + assert tokens[2].text == punct_close + + +@pytest.mark.parametrize("punct_open,punct_close", PUNCT_PAIRED) +@pytest.mark.parametrize("punct_open2,punct_close2", [("`", "'")]) +@pytest.mark.parametrize("text", ["Mabuhay"]) +def test_tl_tokenizer_two_diff_punct( + tl_tokenizer, punct_open, punct_close, punct_open2, punct_close2, text +): + tokens = tl_tokenizer(punct_open2 + punct_open + text + punct_close + punct_close2) + assert len(tokens) == 5 + assert tokens[0].text == punct_open2 + assert tokens[1].text == punct_open + assert tokens[2].text == text + assert tokens[3].text == punct_close + assert tokens[4].text == punct_close2 + + +@pytest.mark.parametrize("text,punct", [("(sa'yo", "(")]) +def test_tl_tokenizer_splits_pre_punct_regex(text, punct): + tl_search_prefixes = compile_prefix_regex(TOKENIZER_PREFIXES).search + match = tl_search_prefixes(text) + assert match.group() == punct + + +def test_tl_tokenizer_splits_bracket_period(tl_tokenizer): + text = "(Dumating siya kahapon)." + tokens = tl_tokenizer(text) + assert tokens[len(tokens) - 1].text == "." diff --git a/spacy/tests/lang/tl/test_text.py b/spacy/tests/lang/tl/test_text.py new file mode 100644 index 000000000..17429617c --- /dev/null +++ b/spacy/tests/lang/tl/test_text.py @@ -0,0 +1,73 @@ +import pytest +from spacy.lang.tl.lex_attrs import like_num + +# https://github.com/explosion/spaCy/blob/master/spacy/tests/lang/en/test_text.py + + +def test_tl_tokenizer_handles_long_text(tl_tokenizer): + # Excerpt: "Sapagkat ang Pilosopiya ay Ginagawa" by Padre Roque Ferriols + text = """ + Tingin tayo nang tingin. Kailangan lamang nating dumilat at + marami tayong makikita. At ang pagtingin ay isang gawain na ako lamang ang + makagagawa, kung ako nga ang makakita. Kahit na napanood na ng aking + matalik na kaibigan ang isang sine, kailangan ko pa ring panoorin, kung + ako nga ang may gustong makakita. Kahit na gaano kadikit ang aming + pagkabuklod, hindi siya maaaring tumingin sa isang paraan na ako ang + nakakakita. Kung ako ang makakita, ako lamang ang makatitingin. + """ + tokens = tl_tokenizer(text) + assert len(tokens) == 97 + + +@pytest.mark.parametrize( + "text,length", + [ + ("Huwag mo nang itanong sa akin.", 7), + ("Nasubukan mo na bang hulihin ang hangin?", 8), + ("Hindi ba?", 3), + ("Nagbukas ang DFA ng 1,000 appointment slots para sa pasaporte.", 11), + ("'Wala raw pasok bukas kasi may bagyo!' sabi ni Micah.", 14), + ("'Ingat,' aniya. 
'Maingay sila pag malayo at tahimik kung malapit.'", 17), + ], +) +def test_tl_tokenizer_handles_cnts(tl_tokenizer, text, length): + tokens = tl_tokenizer(text) + assert len(tokens) == length + + +@pytest.mark.parametrize( + "text,match", + [ + ("10", True), + ("isa", True), + ("dalawa", True), + ("tatlumpu", True), + pytest.param( + "isang daan", + True, + marks=pytest.mark.xfail(reason="Not yet implemented (means 100)"), + ), + pytest.param( + "kalahati", + True, + marks=pytest.mark.xfail(reason="Not yet implemented (means 1/2)"), + ), + pytest.param( + "isa't kalahati", + True, + marks=pytest.mark.xfail( + reason="Not yet implemented (means one-and-a-half)" + ), + ), + ], +) +def test_lex_attrs_like_number(tl_tokenizer, text, match): + tokens = tl_tokenizer(text) + assert all([token.like_num for token in tokens]) == match + + +@pytest.mark.xfail(reason="Not yet implemented, fails when capitalized.") +@pytest.mark.parametrize("word", ["isa", "dalawa", "tatlo"]) +def test_tl_lex_attrs_capitals(word): + assert like_num(word) + assert like_num(word.upper()) From 667572adca3a95cbcfd92cea43746b2625a40831 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 2 Nov 2021 14:10:48 +0100 Subject: [PATCH 044/133] Temporarily skip compat tests (#9594) --- spacy/tests/test_cli.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index 3243d426b..00ae2c056 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -492,6 +492,7 @@ def test_string_to_list_intify(value): assert string_to_list(value, intify=True) == [1, 2, 3] +@pytest.mark.skip(reason="Temporarily skip until v3.2.0 release") def test_download_compatibility(): spec = SpecifierSet("==" + about.__version__) spec.prereleases = False @@ -502,6 +503,7 @@ def test_download_compatibility(): assert get_minor_version(about.__version__) == get_minor_version(version) +@pytest.mark.skip(reason="Temporarily skip until v3.2.0 release") def test_validate_compatibility_table(): spec = SpecifierSet("==" + about.__version__) spec.prereleases = False From 4d5db737e9fb815a24b2d071345392185e44abf2 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 2 Nov 2021 14:24:06 +0100 Subject: [PATCH 045/133] Revert "Temporarily skip compat tests (#9594)" This reverts commit 667572adca3a95cbcfd92cea43746b2625a40831. --- spacy/tests/test_cli.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index 00ae2c056..3243d426b 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -492,7 +492,6 @@ def test_string_to_list_intify(value): assert string_to_list(value, intify=True) == [1, 2, 3] -@pytest.mark.skip(reason="Temporarily skip until v3.2.0 release") def test_download_compatibility(): spec = SpecifierSet("==" + about.__version__) spec.prereleases = False @@ -503,7 +502,6 @@ def test_download_compatibility(): assert get_minor_version(about.__version__) == get_minor_version(version) -@pytest.mark.skip(reason="Temporarily skip until v3.2.0 release") def test_validate_compatibility_table(): spec = SpecifierSet("==" + about.__version__) spec.prereleases = False From 53a3523910704e6d6801e8ba474b6a3b4db63a57 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 2 Nov 2021 14:24:54 +0100 Subject: [PATCH 046/133] Revert "Temporarily ignore W095 in assemble CLI CI test (#9460)" This reverts commit 8db574e0b55d00196be50eebc56775a2854c8795. 
--- .github/azure-steps.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/azure-steps.yml b/.github/azure-steps.yml index 8df593bb7..3460bccea 100644 --- a/.github/azure-steps.yml +++ b/.github/azure-steps.yml @@ -98,8 +98,7 @@ steps: - script: | python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')" - # temporarily ignore W095 - PYTHONWARNINGS="error,ignore:[W095]:UserWarning,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir + PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir displayName: 'Test assemble CLI' condition: eq(variables['python_version'], '3.8') From c155f333bb69c6c146224b3295d89c394114aa04 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 2 Nov 2021 14:25:05 +0100 Subject: [PATCH 047/133] Revert "Temporarily use v3.1.0 models in CI" This reverts commit bd6433bbab66de70086e6df9130aab25cca4fe7e. --- .github/azure-steps.yml | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/.github/azure-steps.yml b/.github/azure-steps.yml index 3460bccea..80c88b0b8 100644 --- a/.github/azure-steps.yml +++ b/.github/azure-steps.yml @@ -65,11 +65,8 @@ steps: condition: eq(${{ parameters.gpu }}, true) - script: | - #python -m spacy download ca_core_news_sm - #python -m spacy download ca_core_news_md - # temporarily install the v3.1.0 models - pip install --no-deps https://github.com/explosion/spacy-models/releases/download/ca_core_news_sm-3.1.0/ca_core_news_sm-3.1.0-py3-none-any.whl - pip install --no-deps https://github.com/explosion/spacy-models/releases/download/ca_core_news_md-3.1.0/ca_core_news_md-3.1.0-py3-none-any.whl + python -m spacy download ca_core_news_sm + python -m spacy download ca_core_news_md python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')" displayName: 'Test download CLI' condition: eq(variables['python_version'], '3.8') From 5a979137a710ac19ceac0fb1c4a659a8289682a0 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 2 Nov 2021 15:08:22 +0100 Subject: [PATCH 048/133] Set as_tuples on Doc during processing (#9592) * Set as_tuples on Doc during processing * Fix types * Format --- spacy/language.py | 26 ++++++++++++++++++-------- spacy/tokens/doc.pxd | 2 +- spacy/tokens/doc.pyi | 1 + 3 files changed, 20 insertions(+), 9 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index 64d5d5a8a..49f6dd1a5 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1091,6 +1091,12 @@ class Language: return self.make_doc(doc_like) raise ValueError(Errors.E866.format(type=type(doc_like))) + def _ensure_doc_with_context(self, doc_like: Union[str, Doc], context: Any) -> Doc: + """Create a Doc if need be and add as_tuples context, or raise an error if the input is not a Doc or a string.""" + doc = self._ensure_doc(doc_like) + doc._context = context + return doc + def update( self, examples: Iterable[Example], @@ -1474,7 +1480,7 @@ class Language: @overload def pipe( # noqa: F811 self, - texts: Iterable[Tuple[str, _AnyContext]], + texts: Iterable[Tuple[Union[str, Doc], _AnyContext]], *, as_tuples: Literal[True] = ..., batch_size: Optional[int] = ..., @@ -1486,7 +1492,9 @@ class Language: def pipe( # noqa: F811 self, - texts: Union[Iterable[Union[str, Doc]], Iterable[Tuple[str, _AnyContext]]], + texts: Union[ + Iterable[Union[str, Doc]], Iterable[Tuple[Union[str, Doc], _AnyContext]] + ], *, as_tuples: bool 
= False, batch_size: Optional[int] = None, @@ -1512,18 +1520,20 @@ class Language: """ # Handle texts with context as tuples if as_tuples: - texts = cast(Iterable[Tuple[str, _AnyContext]], texts) - text_context1, text_context2 = itertools.tee(texts) - texts = (tc[0] for tc in text_context1) - contexts = (tc[1] for tc in text_context2) + texts = cast(Iterable[Tuple[Union[str, Doc], _AnyContext]], texts) + docs_with_contexts = ( + self._ensure_doc_with_context(text, context) for text, context in texts + ) docs = self.pipe( - texts, + docs_with_contexts, batch_size=batch_size, disable=disable, n_process=n_process, component_cfg=component_cfg, ) - for doc, context in zip(docs, contexts): + for doc in docs: + context = doc._context + doc._context = None yield (doc, context) return diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd index c74ee0b63..57d087958 100644 --- a/spacy/tokens/doc.pxd +++ b/spacy/tokens/doc.pxd @@ -56,7 +56,7 @@ cdef class Doc: cdef public bint has_unknown_spaces - cdef public list _py_tokens + cdef public object _context cdef int length cdef int max_length diff --git a/spacy/tokens/doc.pyi b/spacy/tokens/doc.pyi index 0fa1d0d4f..46a10df03 100644 --- a/spacy/tokens/doc.pyi +++ b/spacy/tokens/doc.pyi @@ -29,6 +29,7 @@ class Doc: tensor: numpy.ndarray user_data: Dict[str, Any] has_unknown_spaces: bool + _context: Any @classmethod def set_extension( cls, From 61daac54e4c2172a6ec0ae84858feb51e32a173c Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 3 Nov 2021 07:51:53 +0100 Subject: [PATCH 049/133] Serialize _context separately in multiprocessing pipe (#9597) * Serialize _context with Doc * Revert "Serialize _context with Doc" This reverts commit 161f1fac9115778f310eb4ce13ca7825c8129611. * Serialize Doc._context separately for multiprocessing pipe --- spacy/language.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index 49f6dd1a5..55c9912cc 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1631,11 +1631,12 @@ class Language: recv.recv() for recv in cycle(bytedocs_recv_ch) ) try: - for i, (_, (byte_doc, byte_error)) in enumerate( + for i, (_, (byte_doc, byte_context, byte_error)) in enumerate( zip(raw_texts, byte_tuples), 1 ): if byte_doc is not None: doc = Doc(self.vocab).from_bytes(byte_doc) + doc._context = byte_context yield doc elif byte_error is not None: error = srsly.msgpack_loads(byte_error) @@ -2186,12 +2187,12 @@ def _apply_pipes( for pipe in pipes: docs = pipe(docs) # type: ignore[arg-type, assignment] # Connection does not accept unpickable objects, so send list. 
- byte_docs = [(doc.to_bytes(), None) for doc in docs] - padding = [(None, None)] * (len(texts) - len(byte_docs)) + byte_docs = [(doc.to_bytes(), doc._context, None) for doc in docs] + padding = [(None, None, None)] * (len(texts) - len(byte_docs)) sender.send(byte_docs + padding) # type: ignore[operator] except Exception: - error_msg = [(None, srsly.msgpack_dumps(traceback.format_exc()))] - padding = [(None, None)] * (len(texts) - 1) + error_msg = [(None, None, srsly.msgpack_dumps(traceback.format_exc()))] + padding = [(None, None, None)] * (len(texts) - 1) sender.send(error_msg + padding) From 6eee024ff6fe12d3cac82e7daf3e582368bc670f Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 3 Nov 2021 09:14:29 +0100 Subject: [PATCH 050/133] Pickle Doc._context (#9603) --- spacy/tests/doc/test_pickle_doc.py | 2 ++ spacy/tokens/doc.pyx | 5 +++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/spacy/tests/doc/test_pickle_doc.py b/spacy/tests/doc/test_pickle_doc.py index 28cb66714..738a751a0 100644 --- a/spacy/tests/doc/test_pickle_doc.py +++ b/spacy/tests/doc/test_pickle_doc.py @@ -5,9 +5,11 @@ from spacy.compat import pickle def test_pickle_single_doc(): nlp = Language() doc = nlp("pickle roundtrip") + doc._context = 3 data = pickle.dumps(doc, 1) doc2 = pickle.loads(data) assert doc2.text == "pickle roundtrip" + assert doc2._context == 3 def test_list_of_docs_pickles_efficiently(): diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 3709cece0..362a17784 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -1710,17 +1710,18 @@ cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end): def pickle_doc(doc): bytes_data = doc.to_bytes(exclude=["vocab", "user_data", "user_hooks"]) hooks_and_data = (doc.user_data, doc.user_hooks, doc.user_span_hooks, - doc.user_token_hooks) + doc.user_token_hooks, doc._context) return (unpickle_doc, (doc.vocab, srsly.pickle_dumps(hooks_and_data), bytes_data)) def unpickle_doc(vocab, hooks_and_data, bytes_data): - user_data, doc_hooks, span_hooks, token_hooks = srsly.pickle_loads(hooks_and_data) + user_data, doc_hooks, span_hooks, token_hooks, _context = srsly.pickle_loads(hooks_and_data) doc = Doc(vocab, user_data=user_data).from_bytes(bytes_data, exclude=["user_data"]) doc.user_hooks.update(doc_hooks) doc.user_span_hooks.update(span_hooks) doc.user_token_hooks.update(token_hooks) + doc._context = _context return doc From e43639b27afbd72d24e17e49dbb030ffff4e40cd Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Wed, 3 Nov 2021 08:55:30 +0000 Subject: [PATCH 051/133] Add note about round-trip serializing pipeline to API docs (#9583) --- website/docs/api/language.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/website/docs/api/language.md b/website/docs/api/language.md index d0d6b9514..45c42040e 100644 --- a/website/docs/api/language.md +++ b/website/docs/api/language.md @@ -1000,6 +1000,11 @@ subclasses like `English` or `German` to make language-specific functionality like the [lexical attribute getters](/usage/linguistic-features#language-data) available to the loaded object. +Note that if you want to serialize and reload a whole pipeline, using this alone +won't work, you also need to handle the config. See +["Serializing the pipeline"](https://spacy.io/usage/saving-loading#pipeline) for +details. 
+ > #### Example > > ```python From 79cea0398306142334a3e8c3ba9de29c00050ee7 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 3 Nov 2021 09:56:00 +0100 Subject: [PATCH 052/133] Update website model display (#9589) * Remove vectors from core trf model descriptions * Update accuracy labels and exclude morph_acc for ja --- website/src/templates/models.js | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/website/src/templates/models.js b/website/src/templates/models.js index 554823ebf..69cec3376 100644 --- a/website/src/templates/models.js +++ b/website/src/templates/models.js @@ -31,7 +31,7 @@ const COMPONENT_LINKS = { const MODEL_META = { core: 'Vocabulary, syntax, entities, vectors', - core_sm: 'Vocabulary, syntax, entities', + core_no_vectors: 'Vocabulary, syntax, entities', dep: 'Vocabulary, syntax', ent: 'Named entities', sent: 'Sentence boundaries', @@ -41,14 +41,16 @@ const MODEL_META = { web: 'written text (blogs, news, comments)', news: 'written text (news, media)', wiki: 'Wikipedia', - uas: 'Unlabelled dependencies', - las: 'Labelled dependencies', - dep_uas: 'Unlabelled dependencies', - dep_las: 'Labelled dependencies', + uas: 'Unlabeled dependencies', + las: 'Labeled dependencies', + dep_uas: 'Unlabeled dependencies', + dep_las: 'Labeled dependencies', token_acc: 'Tokenization', tok: 'Tokenization', lemma: 'Lemmatization', morph: 'Morphological analysis', + lemma_acc: 'Lemmatization', + morph_acc: 'Morphological analysis', tags_acc: 'Part-of-speech tags (fine grained tags, Token.tag)', tag_acc: 'Part-of-speech tags (fine grained tags, Token.tag)', tag: 'Part-of-speech tags (fine grained tags, Token.tag)', @@ -115,8 +117,8 @@ function formatVectors(data) { return `${abbrNum(keys)} keys, ${abbrNum(vectors)} unique vectors (${width} dimensions)` } -function formatAccuracy(data) { - const exclude = ['speed'] +function formatAccuracy(data, lang) { + const exclude = (lang !== "ja") ? ['speed'] : ['speed', 'morph_acc'] if (!data) return [] return Object.keys(data) .map(label => { @@ -147,8 +149,7 @@ function formatModelMeta(data) { license: data.license, labels: isEmptyObj(data.labels) ? null : data.labels, vectors: formatVectors(data.vectors), - // TODO: remove accuracy fallback - accuracy: formatAccuracy(data.accuracy || data.performance), + accuracy: formatAccuracy(data.performance, data.lang), } } @@ -196,7 +197,7 @@ const Model = ({ const [isError, setIsError] = useState(true) const [meta, setMeta] = useState({}) const { type, genre, size } = getModelComponents(name) - const display_type = type === 'core' && size === 'sm' ? 'core_sm' : type + const display_type = type === 'core' && (size === 'sm' || size === 'trf') ? 
'core_no_vectors' : type const version = useMemo(() => getLatestVersion(name, compatibility, prereleases), [ name, compatibility, From db0d8c56d0ec6572030c9aa5fc6a2a23ff5a521f Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 3 Nov 2021 10:57:34 +0100 Subject: [PATCH 053/133] Add test for Language.pipe as_tuples with custom error handlers (#9608) * make nlp.pipe() return None docs when no exceptions are (re-)raised during error handling * Remove changes other than as_tuples test * Only check warning count for one process * Fix types * Format Co-authored-by: Xi Bai --- spacy/language.py | 5 +-- spacy/tests/test_language.py | 80 +++++++++++++++++++++++++----------- 2 files changed, 58 insertions(+), 27 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index 55c9912cc..aa57989ac 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1537,8 +1537,7 @@ class Language: yield (doc, context) return - # At this point, we know that we're dealing with an iterable of plain texts - texts = cast(Iterable[str], texts) + texts = cast(Iterable[Union[str, Doc]], texts) # Set argument defaults if n_process == -1: @@ -1592,7 +1591,7 @@ class Language: def _multiprocessing_pipe( self, - texts: Iterable[str], + texts: Iterable[Union[str, Doc]], pipes: Iterable[Callable[..., Iterator[Doc]]], n_process: int, batch_size: int, diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py index 444b1c83e..c5fdc8eb0 100644 --- a/spacy/tests/test_language.py +++ b/spacy/tests/test_language.py @@ -255,6 +255,38 @@ def test_language_pipe_error_handler_custom(en_vocab, n_process): assert [doc.text for doc in docs] == ["TEXT 111", "TEXT 333", "TEXT 666"] +@pytest.mark.parametrize("n_process", [1, 2]) +def test_language_pipe_error_handler_input_as_tuples(en_vocab, n_process): + """Test the error handling of nlp.pipe with input as tuples""" + Language.component("my_evil_component", func=evil_component) + ops = get_current_ops() + if isinstance(ops, NumpyOps) or n_process < 2: + nlp = English() + nlp.add_pipe("my_evil_component") + texts = [ + ("TEXT 111", 111), + ("TEXT 222", 222), + ("TEXT 333", 333), + ("TEXT 342", 342), + ("TEXT 666", 666), + ] + with pytest.raises(ValueError): + list(nlp.pipe(texts, as_tuples=True)) + nlp.set_error_handler(warn_error) + logger = logging.getLogger("spacy") + with mock.patch.object(logger, "warning") as mock_warning: + tuples = list(nlp.pipe(texts, as_tuples=True, n_process=n_process)) + # HACK/TODO? 
the warnings in child processes don't seem to be + # detected by the mock logger + if n_process == 1: + mock_warning.assert_called() + assert mock_warning.call_count == 2 + assert len(tuples) + mock_warning.call_count == len(texts) + assert (tuples[0][0].text, tuples[0][1]) == ("TEXT 111", 111) + assert (tuples[1][0].text, tuples[1][1]) == ("TEXT 333", 333) + assert (tuples[2][0].text, tuples[2][1]) == ("TEXT 666", 666) + + @pytest.mark.parametrize("n_process", [1, 2]) def test_language_pipe_error_handler_pipe(en_vocab, n_process): """Test the error handling of a component's pipe method""" @@ -515,19 +547,19 @@ def test_spacy_blank(): @pytest.mark.parametrize( "lang,target", [ - ('en', 'en'), - ('fra', 'fr'), - ('fre', 'fr'), - ('iw', 'he'), - ('mo', 'ro'), - ('mul', 'xx'), - ('no', 'nb'), - ('pt-BR', 'pt'), - ('xx', 'xx'), - ('zh-Hans', 'zh'), - ('zh-Hant', None), - ('zxx', None) - ] + ("en", "en"), + ("fra", "fr"), + ("fre", "fr"), + ("iw", "he"), + ("mo", "ro"), + ("mul", "xx"), + ("no", "nb"), + ("pt-BR", "pt"), + ("xx", "xx"), + ("zh-Hans", "zh"), + ("zh-Hant", None), + ("zxx", None), + ], ) def test_language_matching(lang, target): """ @@ -540,17 +572,17 @@ def test_language_matching(lang, target): @pytest.mark.parametrize( "lang,target", [ - ('en', 'en'), - ('fra', 'fr'), - ('fre', 'fr'), - ('iw', 'he'), - ('mo', 'ro'), - ('mul', 'xx'), - ('no', 'nb'), - ('pt-BR', 'pt'), - ('xx', 'xx'), - ('zh-Hans', 'zh'), - ] + ("en", "en"), + ("fra", "fr"), + ("fre", "fr"), + ("iw", "he"), + ("mo", "ro"), + ("mul", "xx"), + ("no", "nb"), + ("pt-BR", "pt"), + ("xx", "xx"), + ("zh-Hans", "zh"), + ], ) def test_blank_languages(lang, target): """ From e06bbf72a4cda6e2ae41f1948290eb18e20bfe87 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 3 Nov 2021 15:11:07 +0100 Subject: [PATCH 054/133] Fix tok2vec-less textcat generation in website quickstart (#9610) --- spacy/cli/templates/quickstart_training.jinja | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja index 50dbc6e42..b78806fec 100644 --- a/spacy/cli/templates/quickstart_training.jinja +++ b/spacy/cli/templates/quickstart_training.jinja @@ -16,8 +16,10 @@ gpu_allocator = null [nlp] lang = "{{ lang }}" -{%- set no_tok2vec = components|length == 1 and (("textcat" in components or "textcat_multilabel" in components) and optimize == "efficiency")-%} -{%- if not no_tok2vec and ("tagger" in components or "morphologizer" in components or "parser" in components or "ner" in components or "entity_linker" in components or "textcat" in components or "textcat_multilabel" in components) -%} +{%- set has_textcat = ("textcat" in components or "textcat_multilabel" in components) -%} +{%- set with_accuracy = optimize == "accuracy" -%} +{%- set has_accurate_textcat = has_textcat and with_accuracy -%} +{%- if ("tagger" in components or "morphologizer" in components or "parser" in components or "ner" in components or "entity_linker" in components or has_accurate_textcat) -%} {%- set full_pipeline = ["transformer" if use_transformer else "tok2vec"] + components %} {%- else -%} {%- set full_pipeline = components %} @@ -199,7 +201,7 @@ no_output_layer = false {# NON-TRANSFORMER PIPELINE #} {% else -%} -{% if not no_tok2vec-%} +{% if "tok2vec" in full_pipeline -%} [components.tok2vec] factory = "tok2vec" From c1cc94a33aca9304a5351b3da7c9ebd923eea396 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Wed, 3 Nov 2021 14:16:55 +0000 Subject: [PATCH 
055/133] Fix typo about receptive field size (#9564) --- spacy/ml/models/tok2vec.py | 2 +- website/docs/api/architectures.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py index 8d78e418f..44ab50e85 100644 --- a/spacy/ml/models/tok2vec.py +++ b/spacy/ml/models/tok2vec.py @@ -53,7 +53,7 @@ def build_hash_embed_cnn_tok2vec( window_size (int): The number of tokens on either side to concatenate during the convolutions. The receptive field of the CNN will be depth * (window_size * 2 + 1), so a 4-layer network with window_size of - 2 will be sensitive to 17 words at a time. Recommended value is 1. + 2 will be sensitive to 20 words at a time. Recommended value is 1. embed_size (int): The number of rows in the hash embedding tables. This can be surprisingly small, due to the use of the hash embeddings. Recommended values are between 2000 and 10000. diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md index 7044a7d02..01ca4540b 100644 --- a/website/docs/api/architectures.md +++ b/website/docs/api/architectures.md @@ -82,7 +82,7 @@ consisting of a CNN and a layer-normalized maxout activation function. | `width` | The width of the input and output. These are required to be the same, so that residual connections can be used. Recommended values are `96`, `128` or `300`. ~~int~~ | | `depth` | The number of convolutional layers to use. Recommended values are between `2` and `8`. ~~int~~ | | `embed_size` | The number of rows in the hash embedding tables. This can be surprisingly small, due to the use of the hash embeddings. Recommended values are between `2000` and `10000`. ~~int~~ | -| `window_size` | The number of tokens on either side to concatenate during the convolutions. The receptive field of the CNN will be `depth * (window_size * 2 + 1)`, so a 4-layer network with a window size of `2` will be sensitive to 17 words at a time. Recommended value is `1`. ~~int~~ | +| `window_size` | The number of tokens on either side to concatenate during the convolutions. The receptive field of the CNN will be `depth * (window_size * 2 + 1)`, so a 4-layer network with a window size of `2` will be sensitive to 20 words at a time. Recommended value is `1`. ~~int~~ | | `maxout_pieces` | The number of pieces to use in the maxout non-linearity. If `1`, the [`Mish`](https://thinc.ai/docs/api-layers#mish) non-linearity is used instead. Recommended values are `1`-`3`. ~~int~~ | | `subword_features` | Whether to also embed subword features, specifically the prefix, suffix and word shape. This is recommended for alphabetic languages like English, but not if single-character tokens are used for a language such as Chinese. ~~bool~~ | | `pretrained_vectors` | Whether to also use static vectors. 
~~bool~~ | From cab9209c3dfcd1b75dfe5657f10e52c4d847a3cf Mon Sep 17 00:00:00 2001 From: Bram Vanroy Date: Wed, 3 Nov 2021 15:29:32 +0100 Subject: [PATCH 056/133] use metaclass to decorate errors (#9593) --- spacy/errors.py | 25 +++++++++---------------- spacy/tests/test_errors.py | 5 ++--- 2 files changed, 11 insertions(+), 19 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index ff1185361..2da52e3b8 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -1,18 +1,13 @@ import warnings -def add_codes(err_cls): - """Add error codes to string messages via class attribute names.""" - - class ErrorsWithCodes(err_cls): - def __getattribute__(self, code): - msg = super(ErrorsWithCodes, self).__getattribute__(code) - if code.startswith("__"): # python system attributes like __class__ - return msg - else: - return "[{code}] {msg}".format(code=code, msg=msg) - - return ErrorsWithCodes() +class ErrorsWithCodes(type): + def __getattribute__(self, code): + msg = super().__getattribute__(code) + if code.startswith("__"): # python system attributes like __class__ + return msg + else: + return "[{code}] {msg}".format(code=code, msg=msg) def setup_default_warnings(): @@ -44,8 +39,7 @@ def _escape_warning_msg(msg): # fmt: off -@add_codes -class Warnings: +class Warnings(metaclass=ErrorsWithCodes): W005 = ("Doc object not parsed. This means displaCy won't be able to " "generate a dependency visualization for it. Make sure the Doc " "was processed with a model that supports dependency parsing, and " @@ -194,8 +188,7 @@ class Warnings: "lead to errors.") -@add_codes -class Errors: +class Errors(metaclass=ErrorsWithCodes): E001 = ("No component '{name}' found in pipeline. Available names: {opts}") E002 = ("Can't find factory for '{name}' for language {lang} ({lang_code}). 
" "This usually happens when spaCy calls `nlp.{method}` with a custom " diff --git a/spacy/tests/test_errors.py b/spacy/tests/test_errors.py index e79abc6ab..a845a52c9 100644 --- a/spacy/tests/test_errors.py +++ b/spacy/tests/test_errors.py @@ -2,11 +2,10 @@ from inspect import isclass import pytest -from spacy.errors import add_codes +from spacy.errors import ErrorsWithCodes -@add_codes -class Errors: +class Errors(metaclass=ErrorsWithCodes): E001 = "error description" From 6e6650307d6f785bc16f09178f3521a82f7bd3fa Mon Sep 17 00:00:00 2001 From: Duygu Altinok Date: Thu, 4 Nov 2021 23:55:49 +0100 Subject: [PATCH 057/133] Portuguese noun chunks review (#9559) * added tests * added pt vocab * transferred spanish * added syntax iters * fixed parenthesis * added nmod example * added relative pron * fixed rel pron * added rel subclause * corrected typo * added more NP chains * long sentence * fixed typo * fixed typo * fixed typo * corrected heads * added passive subj * added pass subj * added passive obj * refinement to rights * went back to odl * fixed test * fixed typo * fixed typo * formatted * Format * Format test cases Co-authored-by: Adriane Boyd --- spacy/lang/pt/__init__.py | 2 + spacy/lang/pt/syntax_iterators.py | 85 +++++++++ spacy/tests/conftest.py | 5 + spacy/tests/lang/pt/test_noun_chunks.py | 221 ++++++++++++++++++++++++ 4 files changed, 313 insertions(+) create mode 100644 spacy/lang/pt/syntax_iterators.py create mode 100644 spacy/tests/lang/pt/test_noun_chunks.py diff --git a/spacy/lang/pt/__init__.py b/spacy/lang/pt/__init__.py index 9ae6501fb..454002491 100644 --- a/spacy/lang/pt/__init__.py +++ b/spacy/lang/pt/__init__.py @@ -1,6 +1,7 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS +from .syntax_iterators import SYNTAX_ITERATORS from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES from ...language import Language, BaseDefaults @@ -10,6 +11,7 @@ class PortugueseDefaults(BaseDefaults): infixes = TOKENIZER_INFIXES prefixes = TOKENIZER_PREFIXES lex_attr_getters = LEX_ATTRS + syntax_iterators = SYNTAX_ITERATORS stop_words = STOP_WORDS diff --git a/spacy/lang/pt/syntax_iterators.py b/spacy/lang/pt/syntax_iterators.py new file mode 100644 index 000000000..62661f5e4 --- /dev/null +++ b/spacy/lang/pt/syntax_iterators.py @@ -0,0 +1,85 @@ +from typing import Union, Iterator, Tuple + +from ...symbols import NOUN, PROPN, PRON +from ...errors import Errors +from ...tokens import Doc, Span + + +def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]: + """ + Detect base noun phrases from a dependency parse. Works on both Doc and Span. + """ + labels = [ + "nsubj", + "nsubj:pass", + "obj", + "obl", + "obl:agent", + "nmod", + "pcomp", + "appos", + "ROOT", + ] + post_modifiers = ["flat", "flat:name", "fixed", "compound"] + doc = doclike.doc # Ensure works on both Doc and Span. 
+ if not doc.has_annotation("DEP"): + raise ValueError(Errors.E029) + np_deps = {doc.vocab.strings.add(label) for label in labels} + np_modifs = {doc.vocab.strings.add(modifier) for modifier in post_modifiers} + np_label = doc.vocab.strings.add("NP") + adj_label = doc.vocab.strings.add("amod") + det_label = doc.vocab.strings.add("det") + det_pos = doc.vocab.strings.add("DET") + adp_label = doc.vocab.strings.add("ADP") + conj = doc.vocab.strings.add("conj") + conj_pos = doc.vocab.strings.add("CCONJ") + prev_end = -1 + for i, word in enumerate(doclike): + if word.pos not in (NOUN, PROPN, PRON): + continue + # Prevent nested chunks from being produced + if word.left_edge.i <= prev_end: + continue + if word.dep in np_deps: + right_childs = list(word.rights) + right_child = right_childs[0] if right_childs else None + + if right_child: + if ( + right_child.dep == adj_label + ): # allow chain of adjectives by expanding to right + right_end = right_child.right_edge + elif ( + right_child.dep == det_label and right_child.pos == det_pos + ): # cut relative pronouns here + right_end = right_child + elif right_child.dep in np_modifs: # Check if we can expand to right + right_end = word.right_edge + else: + right_end = word + else: + right_end = word + prev_end = right_end.i + + left_index = word.left_edge.i + left_index = ( + left_index + 1 if word.left_edge.pos == adp_label else left_index + ) + + yield left_index, right_end.i + 1, np_label + elif word.dep == conj: + head = word.head + while head.dep == conj and head.head.i < head.i: + head = head.head + # If the head is an NP, and we're coordinated to it, we're an NP + if head.dep in np_deps: + prev_end = word.i + + left_index = word.left_edge.i # eliminate left attached conjunction + left_index = ( + left_index + 1 if word.left_edge.pos == conj_pos else left_index + ) + yield left_index, word.i + 1, np_label + + +SYNTAX_ITERATORS = {"noun_chunks": noun_chunks} diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index b4819ed5c..afe23888d 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -247,6 +247,11 @@ def pt_tokenizer(): return get_lang_class("pt")().tokenizer +@pytest.fixture(scope="session") +def pt_vocab(): + return get_lang_class("pt")().vocab + + @pytest.fixture(scope="session") def ro_tokenizer(): return get_lang_class("ro")().tokenizer diff --git a/spacy/tests/lang/pt/test_noun_chunks.py b/spacy/tests/lang/pt/test_noun_chunks.py new file mode 100644 index 000000000..9a42ce268 --- /dev/null +++ b/spacy/tests/lang/pt/test_noun_chunks.py @@ -0,0 +1,221 @@ +from spacy.tokens import Doc +import pytest + + +# fmt: off +@pytest.mark.parametrize( + "words,heads,deps,pos,chunk_offsets", + [ + # determiner + noun + # um cachorro -> um cachorro + ( + ["um", "cachorro"], + [1, 1], + ["det", "ROOT"], + ["DET", "NOUN"], + [(0, 2)], + ), + # two determiners + noun + # meu o pai -> meu o pai + ( + ["meu", "o", "pai"], + [2, 2, 2], + ["det", "det", "ROOT"], + ["DET", "DET", "NOUN"], + [(0, 3)], + ), + # two determiners + noun + # todos essos caros -> todos essos caros + ( + ["todos", "essos", "caros"], + [2, 2, 2], + ["det", "det", "ROOT"], + ["DET", "DET", "NOUN"], + [(0, 3)], + ), + # two determiners, one is after noun + # um irmão meu -> um irmão meu + ( + ["um", "irmão", "meu"], + [1, 1, 1], + ["det", "ROOT", "det"], + ["DET", "NOUN", "DET"], + [(0, 3)], + ), + # two determiners + noun + # o meu pai -> o meu pai + ( + ["o", "meu", "pai"], + [2, 2, 2], + ["det","det", "ROOT"], + ["DET", "DET", "NOUN"], + [(0, 3)], + ), + 
# relative pronoun + # A bicicleta essa está estragada -> A bicicleta + ( + ['A', 'bicicleta', 'essa', 'está', 'estragada'], + [1, 4, 1, 4, 4], + ['det', 'nsubj', 'det', 'cop', 'ROOT'], + ['DET', 'NOUN', 'PRON', 'AUX', 'ADJ'], + [(0,2)] + ), + # relative subclause + # o computador que comprou -> o computador + ( + ['o', 'computador', 'que', 'comprou'], + [1, 1, 3, 1], + ['det', 'ROOT', 'nsubj', 'acl:relcl'], + ['DET', 'NOUN', 'PRON', 'VERB'], + [(0, 2), (2, 3)] + ), + # det + noun + adj + # O cachorro marrom -> O cachorro marrom + ( + ["O", "cachorro", "marrom"], + [1, 1, 1], + ["det", "ROOT", "amod"], + ["DET", "NOUN", "ADJ"], + [(0, 3)], + ), + # det + noun + adj plural + # As calças baratas -> As calças baratas + ( + ["As", "calças", "baratas"], + [1, 1, 1], + ["det", "ROOT", "amod"], + ["DET", "NOUN", "ADJ"], + [(0, 3)], + ), + # det + adj + noun + # Uma boa ideia -> Uma boa ideia + ( + ['uma', 'boa', 'ideia'], + [2, 2, 2], + ["det", "amod", "ROOT"], + ["DET", "ADJ", "NOUN"], + [(0,3)] + ), + # multiple adjectives + # Uma garota esperta e inteligente -> Uma garota esperta e inteligente + ( + ["Uma", "garota", "esperta", "e", "inteligente"], + [1, 1, 1, 4, 2], + ["det", "ROOT", "amod", "cc", "conj"], + ["DET", "NOUN", "ADJ", "CCONJ", "ADJ"], + [(0,5)] + ), + # determiner, adjective, compound created by flat + # a grande São Paolo -> a grande São Paolo + ( + ["a", "grande", "São", "Paolo"], + [2, 2, 2, 2], + ["det", "amod", "ROOT", "flat:name"], + ["DET", "ADJ", "PROPN", "PROPN"], + [(0,4)] + ), + # one determiner + one noun + one adjective qualified by an adverb + # alguns fazendeiros muito ricos -> alguns fazendeiros muito ricos + ( + ['alguns', 'fazendeiros', 'muito', 'ricos'], + [1, 1, 3, 1], + ['det', 'ROOT', 'advmod', 'amod'], + ['DET', 'NOUN', 'ADV', 'ADJ'], + [(0,4)] + ), + # Two NPs conjuncted + # Eu tenho um cachorro e um gato -> Eu, um cacharo, um gato + ( + ["Eu", "tenho", "um", "cachorro", "e", "um", "gato"], + [1, 1, 3, 1, 6, 6, 3], + ['nsubj', 'ROOT', 'det', 'obj', 'cc', 'det', 'conj'], + ['PRON', 'VERB', 'DET', 'NOUN', 'CCONJ', 'DET', 'NOUN'], + [(0,1), (2,4), (5,7)] + + ), + # Two NPs together + # o escritor brasileiro Aníbal Machado -> o escritor brasileiro, Aníbal Machado + ( + ['o', 'escritor', 'brasileiro', 'Aníbal', 'Machado'], + [1, 1, 1, 1, 3], + ['det', 'ROOT', 'amod', 'appos', 'flat:name'], + ['DET', 'NOUN', 'ADJ', 'PROPN', 'PROPN'], + [(0, 3), (3, 5)] + ), + # Noun compound, person name and titles + # Dom Pedro II -> Dom Pedro II + ( + ["Dom", "Pedro", "II"], + [0, 0, 0], + ["ROOT", "flat:name", "flat:name"], + ["PROPN", "PROPN", "PROPN"], + [(0,3)] + ), + # Noun compound created by flat + # os Estados Unidos -> os Estados Unidos + ( + ["os", "Estados", "Unidos"], + [1, 1, 1], + ["det", "ROOT", "flat:name"], + ["DET", "PROPN", "PROPN"], + [(0,3)] + ), + # nmod relation between NPs + # a destruição da cidade -> a destruição, cidade + ( + ['a', 'destruição', 'da', 'cidade'], + [1, 1, 3, 1], + ['det', 'ROOT', 'case', 'nmod'], + ['DET', 'NOUN', 'ADP', 'NOUN'], + [(0,2), (3,4)] + ), + # Compounding by nmod, several NPs chained together + # a primeira fábrica de medicamentos do governo -> a primeira fábrica, medicamentos, governo + ( + ["a", "primeira", "fábrica", "de", "medicamentos", "do", "governo"], + [2, 2, 2, 4, 2, 6, 2], + ['det', 'amod', 'ROOT', 'case', 'nmod', 'case', 'nmod'], + ['DET', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN'], + [(0, 3), (4, 5), (6, 7)] + ), + # several NPs + # Tradução da reportagem de Susana -> Tradução, reportagem, Susana + ( + 
['Tradução', 'da', 'reportagem', 'de', 'Susana'], + [0, 2, 0, 4, 2], + ['ROOT', 'case', 'nmod', 'case', 'nmod'], + ['NOUN', 'ADP', 'NOUN', 'ADP', 'PROPN'], + [(0,1), (2,3), (4,5)] + + ), + # Several NPs + # O gato gordo da Susana e seu amigo -> O gato gordo, Susana, seu amigo + ( + ['O', 'gato', 'gordo', 'da', 'Susana', 'e', 'seu', 'amigo'], + [1, 1, 1, 4, 1, 7, 7, 1], + ['det', 'ROOT', 'amod', 'case', 'nmod', 'cc', 'det', 'conj'], + ['DET', 'NOUN', 'ADJ', 'ADP', 'PROPN', 'CCONJ', 'DET', 'NOUN'], + [(0,3), (4,5), (6,8)] + ), + # Passive subject + # Os novos gastos são alimentados pela grande conta bancária de Clinton -> Os novos gastos, grande conta bancária, Clinton + ( + ['Os', 'novos', 'gastos', 'são', 'alimentados', 'pela', 'grande', 'conta', 'bancária', 'de', 'Clinton'], + [2, 2, 4, 4, 4, 7, 7, 4, 7, 10, 7], + ['det', 'amod', 'nsubj:pass', 'aux:pass', 'ROOT', 'case', 'amod', 'obl:agent', 'amod', 'case', 'nmod'], + ['DET', 'ADJ', 'NOUN', 'AUX', 'VERB', 'ADP', 'ADJ', 'NOUN', 'ADJ', 'ADP', 'PROPN'], + [(0, 3), (6, 9), (10, 11)] + ) + ], +) +# fmt: on +def test_pt_noun_chunks(pt_vocab, words, heads, deps, pos, chunk_offsets): + doc = Doc(pt_vocab, words=words, heads=heads, deps=deps, pos=pos) + assert [(c.start, c.end) for c in doc.noun_chunks] == chunk_offsets + + +def test_noun_chunks_is_parsed_pt(pt_tokenizer): + """Test that noun_chunks raises Value Error for 'pt' language if Doc is not parsed.""" + doc = pt_tokenizer("en Oxford este verano") + with pytest.raises(ValueError): + list(doc.noun_chunks) From f0e8c9fe58267edaef3d82fe12ad5c0fb5c431e6 Mon Sep 17 00:00:00 2001 From: Duygu Altinok Date: Fri, 5 Nov 2021 00:46:36 +0100 Subject: [PATCH 058/133] Spanish noun chunks review (#9537) * updated syntax iters * formatted the code * added prepositional objects * code clean up * eliminated left attached adp * added es vocab * added basic tests * fixed typo * fixed typo * list to set * fixed doc name * added code for conj * more tests * differentiated adjectives and flat * fixed typo * added compounds * more compounds * tests for compounds * tests for nominal modifiers * fixed typo * fixed typo * formatted file * reformatted tests * fixed typo * fixed punct typo * formatted after changes * added indirect object * added full sentence examples * added longer full sentence examples * fixed sentence length of test * added passive subj * added test case by Damian --- spacy/lang/es/syntax_iterators.py | 106 ++++++++++------- spacy/tests/conftest.py | 5 + spacy/tests/lang/es/test_noun_chunks.py | 150 ++++++++++++++++++++++++ 3 files changed, 217 insertions(+), 44 deletions(-) diff --git a/spacy/lang/es/syntax_iterators.py b/spacy/lang/es/syntax_iterators.py index 8b385a1b9..f2ca2a678 100644 --- a/spacy/lang/es/syntax_iterators.py +++ b/spacy/lang/es/syntax_iterators.py @@ -1,58 +1,76 @@ from typing import Union, Iterator, Tuple -from ...symbols import NOUN, PROPN, PRON, VERB, AUX +from ...symbols import NOUN, PROPN, PRON from ...errors import Errors -from ...tokens import Doc, Span, Token +from ...tokens import Doc, Span def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]: - """Detect base noun phrases from a dependency parse. Works on Doc and Span.""" - doc = doclike.doc + """ + Detect base noun phrases from a dependency parse. Works on both Doc and Span. + """ + labels = [ + "nsubj", + "nsubj:pass", + "obj", + "obl", + "nmod", + "pcomp", + "appos", + "ROOT", + ] + post_modifiers = ["flat", "fixed", "compound"] + doc = doclike.doc # Ensure works on both Doc and Span. 
if not doc.has_annotation("DEP"): raise ValueError(Errors.E029) - if not len(doc): - return + np_deps = {doc.vocab.strings.add(label) for label in labels} + np_modifs = {doc.vocab.strings.add(modifier) for modifier in post_modifiers} np_label = doc.vocab.strings.add("NP") - left_labels = ["det", "fixed", "neg"] # ['nunmod', 'det', 'appos', 'fixed'] - right_labels = ["flat", "fixed", "compound", "neg"] - stop_labels = ["punct"] - np_left_deps = [doc.vocab.strings.add(label) for label in left_labels] - np_right_deps = [doc.vocab.strings.add(label) for label in right_labels] - stop_deps = [doc.vocab.strings.add(label) for label in stop_labels] + adj_label = doc.vocab.strings.add("amod") + adp_label = doc.vocab.strings.add("ADP") + conj = doc.vocab.strings.add("conj") + conj_pos = doc.vocab.strings.add("CCONJ") + prev_end = -1 + for i, word in enumerate(doclike): + if word.pos not in (NOUN, PROPN, PRON): + continue + # Prevent nested chunks from being produced + if word.left_edge.i <= prev_end: + continue + if word.dep in np_deps: + right_childs = list(word.rights) + right_child = right_childs[0] if right_childs else None - prev_right = -1 - for token in doclike: - if token.pos in [PROPN, NOUN, PRON]: - left, right = noun_bounds( - doc, token, np_left_deps, np_right_deps, stop_deps - ) - if left.i <= prev_right: - continue - yield left.i, right.i + 1, np_label - prev_right = right.i - - -def is_verb_token(token: Token) -> bool: - return token.pos in [VERB, AUX] - - -def noun_bounds(doc, root, np_left_deps, np_right_deps, stop_deps): - left_bound = root - for token in reversed(list(root.lefts)): - if token.dep in np_left_deps: - left_bound = token - right_bound = root - for token in root.rights: - if token.dep in np_right_deps: - left, right = noun_bounds( - doc, token, np_left_deps, np_right_deps, stop_deps - ) - filter_func = lambda t: is_verb_token(t) or t.dep in stop_deps - if list(filter(filter_func, doc[left_bound.i : right.i])): - break + if right_child: + if right_child.dep == adj_label: + right_end = right_child.right_edge + elif right_child.dep in np_modifs: # Check if we can expand to right + right_end = word.right_edge + else: + right_end = word else: - right_bound = right - return left_bound, right_bound + right_end = word + prev_end = right_end.i + + left_index = word.left_edge.i + left_index = ( + left_index + 1 if word.left_edge.pos == adp_label else left_index + ) # Eliminate left attached de, del + + yield left_index, right_end.i + 1, np_label + elif word.dep == conj: + head = word.head + while head.dep == conj and head.head.i < head.i: + head = head.head + # If the head is an NP, and we're coordinated to it, we're an NP + if head.dep in np_deps: + prev_end = word.i + + left_index = word.left_edge.i # eliminate left attached conjunction + left_index = ( + left_index + 1 if word.left_edge.pos == conj_pos else left_index + ) + yield left_index, word.i + 1, np_label SYNTAX_ITERATORS = {"noun_chunks": noun_chunks} diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index afe23888d..88c7adfe3 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -120,6 +120,11 @@ def es_tokenizer(): return get_lang_class("es")().tokenizer +@pytest.fixture(scope="session") +def es_vocab(): + return get_lang_class("es")().vocab + + @pytest.fixture(scope="session") def eu_tokenizer(): return get_lang_class("eu")().tokenizer diff --git a/spacy/tests/lang/es/test_noun_chunks.py b/spacy/tests/lang/es/test_noun_chunks.py index e5afd81c9..6118a0458 100644 --- 
a/spacy/tests/lang/es/test_noun_chunks.py +++ b/spacy/tests/lang/es/test_noun_chunks.py @@ -1,6 +1,156 @@ +from spacy.tokens import Doc import pytest +# fmt: off +@pytest.mark.parametrize( + "words,heads,deps,pos,chunk_offsets", + [ + # un gato -> "un gato" + ( + ["un", "gato"], + [1, 1], + ["det", "ROOT"], + ["DET", "NOUN"], + [(0, 2)], + ), + # la camisa negra -> "la camisa negra" + ( + ["la", "camisa", "negra"], + [1, 1, 1], + ["det", "ROOT", "amod"], + ["DET", "NOUN", "ADJ"], + [(0, 3)], + ), + # un lindo gatito -> "un lindo gatito" + ( + ["Un", "lindo", "gatito"], + [2, 2, 2], + ["det", "amod", "ROOT"], + ["DET", "ADJ", "NOUN"], + [(0,3)] + ), + # una chica hermosa e inteligente -> una chica hermosa e inteligente + ( + ["Una", "chica", "hermosa", "e", "inteligente"], + [1, 1, 1, 4, 2], + ["det", "ROOT", "amod", "cc", "conj"], + ["DET", "NOUN", "ADJ", "CCONJ", "ADJ"], + [(0,5)] + ), + # el fabuloso gato pardo -> "el fabuloso gato pardo" + ( + ["el", "fabuloso", "gato", "pardo"], + [2, 2, 2, 2], + ["det", "amod", "ROOT", "amod"], + ["DET", "ADJ", "NOUN", "ADJ"], + [(0,4)] + ), + # Tengo un gato y un perro -> un gato, un perro + ( + ["Tengo", "un", "gato", "y", "un", "perro"], + [0, 2, 0, 5, 5, 0], + ["ROOT", "det", "obj", "cc", "det", "conj"], + ["VERB", "DET", "NOUN", "CCONJ", "DET", "NOUN"], + [(1,3), (4,6)] + + ), + # Dom Pedro II -> Dom Pedro II + ( + ["Dom", "Pedro", "II"], + [0, 0, 0], + ["ROOT", "flat", "flat"], + ["PROPN", "PROPN", "PROPN"], + [(0,3)] + ), + # los Estados Unidos -> los Estados Unidos + ( + ["los", "Estados", "Unidos"], + [1, 1, 1], + ["det", "ROOT", "flat"], + ["DET", "PROPN", "PROPN"], + [(0,3)] + ), + # Miguel de Cervantes -> Miguel de Cervantes + ( + ["Miguel", "de", "Cervantes"], + [0, 2, 0], + ["ROOT", "case", "flat"], + ["PROPN", "ADP", "PROPN"], + [(0,3)] + ), + ( + ["Rio", "de", "Janeiro"], + [0, 2, 0], + ["ROOT", "case", "flat"], + ["PROPN", "ADP", "PROPN"], + [(0,3)] + ), + # la destrucción de la ciudad -> la destrucción, la ciudad + ( + ["la", "destrucción", "de", "la", "ciudad"], + [1, 1, 4, 4, 1], + ['det', 'ROOT', 'case', 'det', 'nmod'], + ['DET', 'NOUN', 'ADP', 'DET', 'NOUN'], + [(0,2), (3,5)] + ), + # la traducción de Susana del informe -> la traducción, Susana, informe + ( + ['la', 'traducción', 'de', 'Susana', 'del', 'informe'], + [1, 1, 3, 1, 5, 1], + ['det', 'ROOT', 'case', 'nmod', 'case', 'nmod'], + ['DET', 'NOUN', 'ADP', 'PROPN', 'ADP', 'NOUN'], + [(0,2), (3,4), (5,6)] + + ), + # El gato regordete de Susana y su amigo -> el gato regordete, Susana, su amigo + ( + ['El', 'gato', 'regordete', 'de', 'Susana', 'y', 'su', 'amigo'], + [1, 1, 1, 4, 1, 7, 7, 1], + ['det', 'ROOT', 'amod', 'case', 'nmod', 'cc', 'det', 'conj'], + ['DET', 'NOUN', 'ADJ', 'ADP', 'PROPN', 'CCONJ', 'DET', 'NOUN'], + [(0,3), (4,5), (6,8)] + ), + # Afirmó que sigue el criterio europeo y que trata de incentivar el mercado donde no lo hay -> el criterio europeo, el mercado, donde, lo + ( + ['Afirmó', 'que', 'sigue', 'el', 'criterio', 'europeo', 'y', 'que', 'trata', 'de', 'incentivar', 'el', 'mercado', 'donde', 'no', 'lo', 'hay'], + [0, 2, 0, 4, 2, 4, 8, 8, 2, 10, 8, 12, 10, 16, 16, 16, 0], + ['ROOT', 'mark', 'ccomp', 'det', 'obj', 'amod', 'cc', 'mark', 'conj', 'mark', 'xcomp', 'det', 'obj', 'obl', 'advmod', 'obj', 'advcl'], + ['VERB', 'SCONJ', 'VERB', 'DET', 'NOUN', 'ADJ', 'CCONJ', 'SCONJ', 'VERB', 'ADP', 'VERB', 'DET', 'NOUN', 'PRON', 'ADV', 'PRON', 'AUX'], + [(3,6), (11,13), (13,14), (15,16)] + ), + # En este sentido se refirió a la reciente creación del Ministerio de Ciencia 
y Tecnología y a las primeras declaraciones de su titular, Anna Birulés, sobre el impulso de la investigación, desarrollo e innovación -> este sentido, se, la reciente creación, Ministerio de Ciencia y Tecnología, a las primeras declaraciones, su titular, , Anna Birulés,, el impulso, la investigación, , desarrollo, innovación + ( + ['En', 'este', 'sentido', 'se', 'refirió', 'a', 'la', 'reciente', 'creación', 'del', 'Ministerio', 'de', 'Ciencia', 'y', 'Tecnología', 'y', 'a', 'las', 'primeras', 'declaraciones', 'de', 'su', 'titular', ',', 'Anna', 'Birulés', ',', 'sobre', 'el', 'impulso', 'de', 'la', 'investigación', ',', 'desarrollo', 'e', 'innovación'], + [2, 2, 4, 4, 4, 8, 8, 8, 4, 10, 8, 12, 10, 14, 12, 19, 19, 19, 19, 8, 22, 22, 19, 24, 22, 24, 24, 29, 29, 19, 32, 32, 29, 34, 32, 36, 32], + ['case', 'det', 'obl', 'obj', 'ROOT', 'case', 'det', 'amod', 'obj', 'case', 'nmod', 'case', 'flat', 'cc', 'conj', 'cc', 'case', 'det', 'amod', 'conj', 'case', 'det', 'nmod', 'punct', 'appos', 'flat', 'punct', 'case', 'det', 'nmod', 'case', 'det', 'nmod', 'punct', 'conj', 'cc', 'conj'], + ['ADP', 'DET', 'NOUN', 'PRON', 'VERB', 'ADP', 'DET', 'ADJ', 'NOUN', 'ADP', 'PROPN', 'ADP', 'PROPN', 'CCONJ', 'PROPN', 'CCONJ', 'ADP', 'DET', 'ADJ', 'NOUN', 'ADP', 'DET', 'NOUN', 'PUNCT', 'PROPN', 'PROPN', 'PUNCT', 'ADP', 'DET', 'NOUN', 'ADP', 'DET', 'NOUN', 'PUNCT', 'NOUN', 'CCONJ', 'NOUN'], + [(1, 3), (3, 4), (6, 9), (10, 15), (16, 20), (21, 23), (23, 27), (28, 30), (31, 33), (33, 35), (36, 37)] + ), + # Asimismo defiende la financiación pública de la investigación básica y pone de manifiesto que las empresas se centran más en la investigación y desarrollo con objetivos de mercado. -> la financiación pública, la investigación básica, manifiesto, las empresas, se, la investigación, desarrollo, objetivos, mercado + ( + ['Asimismo', 'defiende', 'la', 'financiación', 'pública', 'de', 'la', 'investigación', 'básica', 'y', 'pone', 'de', 'manifiesto', 'que', 'las', 'empresas', 'se', 'centran', 'más', 'en', 'la', 'investigación', 'y', 'desarrollo', 'con', 'objetivos', 'de', 'mercado'], + [1, 1, 3, 1, 3, 7, 7, 3, 7, 10, 1, 12, 10, 17, 15, 17, 17, 10, 17, 21, 21, 17, 23, 21, 25, 17, 27, 25], + ['advmod', 'ROOT', 'det', 'obj', 'amod', 'case', 'det', 'nmod', 'amod', 'cc', 'conj', 'case', 'obl', 'mark', 'det', 'nsubj', 'obj', 'ccomp', 'obj', 'case', 'det', 'obl', 'cc', 'conj', 'case', 'obl', 'case', 'nmod'], + ['ADV', 'VERB', 'DET', 'NOUN', 'ADJ', 'ADP', 'DET', 'NOUN', 'ADJ', 'CCONJ', 'VERB', 'ADP', 'NOUN', 'SCONJ', 'DET', 'NOUN', 'PRON', 'VERB', 'ADV', 'ADP', 'DET', 'NOUN', 'CCONJ', 'NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN'], + [(2, 5), (6, 9), (12, 13), (14, 16), (16, 17), (20, 22), (23, 24), (25, 26), (27, 28)] + ), + # Tras indicar que la inversión media en investigación en la Unión Europea se sitúa en el 1,8 por ciento del PIB, frente al 2,8 por ciento en Japón y EEUU, Couceiro dijo que España está en "el buen camino" y se está creando un entorno propicio para la innovación empresarial' -> la inversión media, investigación, la Unión Europea, se, PIB, Japón, EEUU, Couceiro, España, se, un entorno propicio para la innovación empresaria + ( + ['Tras', 'indicar', 'que', 'la', 'inversión', 'media', 'en', 'investigación', 'en', 'la', 'Unión', 'Europea', 'se', 'sitúa', 'en', 'el', '1,8', 'por', 'ciento', 'del', 'PIB', ',', 'frente', 'al', '2,8', 'por', 'ciento', 'en', 'Japón', 'y', 'EEUU', ',', 'Couceiro', 'dijo', 'que', 'España', 'está', 'en', '"', 'el', 'buen', 'camino', '"', 'y', 'se', 'está', 'creando', 'un', 'entorno', 'propicio', 
'para', 'la', 'innovación', 'empresarial'], + [1, 33, 13, 4, 13, 4, 7, 4, 10, 10, 4, 10, 13, 1, 16, 16, 13, 18, 16, 20, 16, 24, 24, 22, 13, 26, 24, 28, 24, 30, 28, 1, 33, 33, 41, 41, 41, 41, 41, 41, 41, 33, 41, 46, 46, 46, 33, 48, 46, 48, 52, 52, 49, 52], + ['mark', 'advcl', 'mark', 'det', 'nsubj', 'amod', 'case', 'nmod', 'case', 'det', 'nmod', 'flat', 'obj', 'ccomp', 'case', 'det', 'obj', 'case', 'compound', 'case', 'nmod', 'punct', 'case', 'fixed', 'obl', 'case', 'compound', 'case', 'nmod', 'cc', 'conj', 'punct', 'nsubj', 'ROOT', 'mark', 'nsubj', 'cop', 'case', 'punct', 'det', 'amod', 'ccomp', 'punct', 'cc', 'obj', 'aux', 'conj', 'det', 'nsubj', 'amod', 'case', 'det', 'nmod', 'amod'], + ['ADP', 'VERB', 'SCONJ', 'DET', 'NOUN', 'ADJ', 'ADP', 'NOUN', 'ADP', 'DET', 'PROPN', 'PROPN', 'PRON', 'VERB', 'ADP', 'DET', 'NUM', 'ADP', 'NUM', 'ADP', 'PROPN', 'PUNCT', 'NOUN', 'ADP', 'NUM', 'ADP', 'NUM', 'ADP', 'PROPN', 'CCONJ', 'PROPN', 'PUNCT', 'PROPN', 'VERB', 'SCONJ', 'PROPN', 'AUX', 'ADP', 'PUNCT', 'DET', 'ADJ', 'NOUN', 'PUNCT', 'CCONJ', 'PRON', 'AUX', 'VERB', 'DET', 'NOUN', 'ADJ', 'ADP', 'DET', 'NOUN', 'ADJ'], + [(3, 6), (7, 8), (9, 12), (12, 13), (20, 21), (28, 29), (30, 31), (32, 33), (35, 36), (44, 45), (47, 54)] + ), + ], +) +# fmt: on +def test_es_noun_chunks(es_vocab, words, heads, deps, pos, chunk_offsets): + doc = Doc(es_vocab, words=words, heads=heads, deps=deps, pos=pos) + assert [(c.start, c.end) for c in doc.noun_chunks] == chunk_offsets + + def test_noun_chunks_is_parsed_es(es_tokenizer): """Test that noun_chunks raises Value Error for 'es' language if Doc is not parsed.""" doc = es_tokenizer("en Oxford este verano") From 199943deb4da7c68f08f578b404dbc6208cc41ac Mon Sep 17 00:00:00 2001 From: Lj Miranda Date: Fri, 5 Nov 2021 10:33:53 +0800 Subject: [PATCH 059/133] Add simple script to add pytest marks --- spacy/tests/regression/util_add_marker.py | 41 +++++++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 spacy/tests/regression/util_add_marker.py diff --git a/spacy/tests/regression/util_add_marker.py b/spacy/tests/regression/util_add_marker.py new file mode 100644 index 000000000..94fa415bc --- /dev/null +++ b/spacy/tests/regression/util_add_marker.py @@ -0,0 +1,41 @@ +import re +from pathlib import Path +from typing import Optional + +import typer + + +def main( + filename: Path, out_file: Optional[Path] = typer.Option(None), dry_run: bool = False +): + """Add pytest issue markers on regression tests + + If --out-file is not used, it will overwrite the original file. You can set + the --dry-run flag to just see the changeset and not write to disk. + """ + lines = [] + with filename.open() as f: + lines = f.readlines() + + # Regex pattern for matching common regression formats (e.g. 
test_issue1234) + pattern = r"def test_issue\d{1,4}" + regex = re.compile(pattern) + + new_lines = [] + for line_text in lines: + if regex.search(line_text): # if match, append marker first + issue_num = int(re.findall(r"\d+", line_text)[0]) # Simple heuristic + typer.echo(f"Found: {line_text} with issue number: {issue_num}") + new_lines.append(f"@pytest.mark.issue({issue_num})\n") + new_lines.append(line_text) + + # Save to file + if not dry_run: + out = out_file or filename + with out.open("w") as f: + for new_line in new_lines: + f.write(new_line) + + +if __name__ == "__main__": + typer.run(main) From 91dec2c76e9affbaafb62cc6a95b317db583c569 Mon Sep 17 00:00:00 2001 From: Lj Miranda Date: Fri, 5 Nov 2021 09:27:08 +0800 Subject: [PATCH 060/133] Decorate non-regression tests --- spacy/tests/lang/en/test_prefix_suffix_infix.py | 1 + spacy/tests/lang/fr/test_prefix_suffix_infix.py | 1 + spacy/tests/matcher/test_dependency_matcher.py | 2 ++ spacy/tests/matcher/test_matcher_logic.py | 1 + spacy/tests/serialize/test_serialize_pipeline.py | 1 + 5 files changed, 6 insertions(+) diff --git a/spacy/tests/lang/en/test_prefix_suffix_infix.py b/spacy/tests/lang/en/test_prefix_suffix_infix.py index 9dfb54fd6..a903496e8 100644 --- a/spacy/tests/lang/en/test_prefix_suffix_infix.py +++ b/spacy/tests/lang/en/test_prefix_suffix_infix.py @@ -119,6 +119,7 @@ def test_en_tokenizer_splits_period_abbr(en_tokenizer): assert tokens[4].text == "Mr." +@pytest.mark.issue(225) @pytest.mark.xfail(reason="Issue #225 - not yet implemented") def test_en_tokenizer_splits_em_dash_infix(en_tokenizer): tokens = en_tokenizer( diff --git a/spacy/tests/lang/fr/test_prefix_suffix_infix.py b/spacy/tests/lang/fr/test_prefix_suffix_infix.py index 7770f807b..272531b63 100644 --- a/spacy/tests/lang/fr/test_prefix_suffix_infix.py +++ b/spacy/tests/lang/fr/test_prefix_suffix_infix.py @@ -4,6 +4,7 @@ from spacy.lang.punctuation import TOKENIZER_INFIXES from spacy.lang.char_classes import ALPHA +@pytest.mark.issue(768) @pytest.mark.parametrize( "text,expected_tokens", [("l'avion", ["l'", "avion"]), ("j'ai", ["j'", "ai"])] ) diff --git a/spacy/tests/matcher/test_dependency_matcher.py b/spacy/tests/matcher/test_dependency_matcher.py index 61ae43c52..1728c82af 100644 --- a/spacy/tests/matcher/test_dependency_matcher.py +++ b/spacy/tests/matcher/test_dependency_matcher.py @@ -370,6 +370,7 @@ def test_dependency_matcher_span_user_data(en_tokenizer): assert doc_t_i == span_t_i + offset +@pytest.mark.issue(9263) def test_dependency_matcher_order_issue(en_tokenizer): # issue from #9263 doc = en_tokenizer("I like text") @@ -415,6 +416,7 @@ def test_dependency_matcher_order_issue(en_tokenizer): assert matches == [] +@pytest.mark.issue(9263) def test_dependency_matcher_remove(en_tokenizer): # issue from #9263 doc = en_tokenizer("The red book") diff --git a/spacy/tests/matcher/test_matcher_logic.py b/spacy/tests/matcher/test_matcher_logic.py index dcbe1ff33..b96bb2032 100644 --- a/spacy/tests/matcher/test_matcher_logic.py +++ b/spacy/tests/matcher/test_matcher_logic.py @@ -152,6 +152,7 @@ def test_operator_combos(en_vocab): assert not matches, (string, pattern_str) +@pytest.mark.issue(1450) def test_matcher_end_zero_plus(en_vocab): """Test matcher works when patterns end with * operator. 
(issue 1450)""" matcher = Matcher(en_vocab) diff --git a/spacy/tests/serialize/test_serialize_pipeline.py b/spacy/tests/serialize/test_serialize_pipeline.py index 05871a524..eebf72638 100644 --- a/spacy/tests/serialize/test_serialize_pipeline.py +++ b/spacy/tests/serialize/test_serialize_pipeline.py @@ -162,6 +162,7 @@ def test_serialize_tagger_strings(en_vocab, de_vocab, taggers): assert label in tagger2.vocab.strings +@pytest.mark.issue(1105) def test_serialize_textcat_empty(en_vocab): # See issue #1105 cfg = {"model": DEFAULT_SINGLE_TEXTCAT_MODEL} From addeb34bc4538cada8f373a16ea89c46dcf63f07 Mon Sep 17 00:00:00 2001 From: Lj Miranda Date: Fri, 5 Nov 2021 09:27:19 +0800 Subject: [PATCH 061/133] Decorate regression tests Even if the issue number is already in the file, I still decorated them just to follow the convention found in test_issue8168.py --- spacy/tests/regression/test_issue1-1000.py | 33 +++++++++++++++++++ spacy/tests/regression/test_issue1001-1500.py | 10 ++++++ spacy/tests/regression/test_issue1501-2000.py | 24 ++++++++++++++ spacy/tests/regression/test_issue2001-2500.py | 10 ++++++ spacy/tests/regression/test_issue2501-3000.py | 15 +++++++++ spacy/tests/regression/test_issue3001-3500.py | 17 ++++++++++ spacy/tests/regression/test_issue3501-4000.py | 20 +++++++++++ spacy/tests/regression/test_issue4001-4500.py | 15 +++++++++ spacy/tests/regression/test_issue4501-5000.py | 11 +++++++ spacy/tests/regression/test_issue5001-5500.py | 6 ++++ spacy/tests/regression/test_issue5501-6000.py | 3 ++ spacy/tests/regression/test_issue6001-6500.py | 2 ++ spacy/tests/regression/test_issue6501-7000.py | 8 +++++ spacy/tests/regression/test_issue7001-8000.py | 6 ++++ spacy/tests/regression/test_issue7716.py | 1 + spacy/tests/regression/test_issue8190.py | 1 + spacy/tests/regression/test_issue8216.py | 1 + 17 files changed, 183 insertions(+) diff --git a/spacy/tests/regression/test_issue1-1000.py b/spacy/tests/regression/test_issue1-1000.py index 6bb71f6f4..4846d2075 100644 --- a/spacy/tests/regression/test_issue1-1000.py +++ b/spacy/tests/regression/test_issue1-1000.py @@ -12,6 +12,7 @@ from spacy.tokens import Doc, Span from ..util import make_tempdir +@pytest.mark.issue(118) @pytest.mark.parametrize( "patterns", [ @@ -39,6 +40,7 @@ def test_issue118(en_tokenizer, patterns): assert ents[0].end == 11 +@pytest.mark.issue(118) @pytest.mark.parametrize( "patterns", [ @@ -66,6 +68,7 @@ def test_issue118_prefix_reorder(en_tokenizer, patterns): assert ents[0].end == 11 +@pytest.mark.issue(242) def test_issue242(en_tokenizer): """Test overlapping multi-word phrases.""" text = "There are different food safety standards in different countries." 
@@ -88,6 +91,7 @@ def test_issue242(en_tokenizer): doc.ents += tuple(matches) +@pytest.mark.issue(309) def test_issue309(en_vocab): """Test Issue #309: SBD fails on empty string""" doc = Doc(en_vocab, words=[" "], heads=[0], deps=["ROOT"]) @@ -96,6 +100,7 @@ def test_issue309(en_vocab): assert len(sents) == 1 +@pytest.mark.issue(351) def test_issue351(en_tokenizer): doc = en_tokenizer(" This is a cat.") assert doc[0].idx == 0 @@ -103,12 +108,14 @@ def test_issue351(en_tokenizer): assert doc[1].idx == 3 +@pytest.mark.issue(360) def test_issue360(en_tokenizer): """Test tokenization of big ellipsis""" tokens = en_tokenizer("$45...............Asking") assert len(tokens) > 2 +@pytest.mark.issue(361) @pytest.mark.parametrize("text1,text2", [("cat", "dog")]) def test_issue361(en_vocab, text1, text2): """Test Issue #361: Equality of lexemes""" @@ -116,6 +123,7 @@ def test_issue361(en_vocab, text1, text2): assert en_vocab[text1] != en_vocab[text2] +@pytest.mark.issue(587) def test_issue587(en_tokenizer): """Test that Matcher doesn't segfault on particular input""" doc = en_tokenizer("a b; c") @@ -131,12 +139,14 @@ def test_issue587(en_tokenizer): assert len(matches) == 2 +@pytest.mark.issue(588) def test_issue588(en_vocab): matcher = Matcher(en_vocab) with pytest.raises(ValueError): matcher.add("TEST", [[]]) +@pytest.mark.issue(590) def test_issue590(en_vocab): """Test overlapping matches""" doc = Doc(en_vocab, words=["n", "=", "1", ";", "a", ":", "5", "%"]) @@ -149,6 +159,7 @@ def test_issue590(en_vocab): assert len(matches) == 2 +@pytest.mark.issue(595) @pytest.mark.skip(reason="Old vocab-based lemmatization") def test_issue595(): """Test lemmatization of base forms""" @@ -164,6 +175,7 @@ def test_issue595(): assert doc[2].lemma_ == "feed" +@pytest.mark.issue(599) def test_issue599(en_vocab): doc = Doc(en_vocab) doc2 = Doc(doc.vocab) @@ -171,12 +183,14 @@ def test_issue599(en_vocab): assert doc2.has_annotation("DEP") +@pytest.mark.issue(600) def test_issue600(): vocab = Vocab(tag_map={"NN": {"pos": "NOUN"}}) doc = Doc(vocab, words=["hello"]) doc[0].tag_ = "NN" +@pytest.mark.issue(615) def test_issue615(en_tokenizer): def merge_phrases(matcher, doc, i, matches): """Merge a phrase. We have to be careful here because we'll change the @@ -204,6 +218,7 @@ def test_issue615(en_tokenizer): assert entities[0].label != 0 +@pytest.mark.issue(736) @pytest.mark.parametrize("text,number", [("7am", "7"), ("11p.m.", "11")]) def test_issue736(en_tokenizer, text, number): """Test that times like "7am" are tokenized correctly and that numbers are @@ -213,6 +228,7 @@ def test_issue736(en_tokenizer, text, number): assert tokens[0].text == number +@pytest.mark.issue(740) @pytest.mark.parametrize("text", ["3/4/2012", "01/12/1900"]) def test_issue740(en_tokenizer, text): """Test that dates are not split and kept as one token. 
This behaviour is @@ -222,6 +238,7 @@ def test_issue740(en_tokenizer, text): assert len(tokens) == 1 +@pytest.mark.issue(743) def test_issue743(): doc = Doc(Vocab(), ["hello", "world"]) token = doc[0] @@ -230,6 +247,7 @@ def test_issue743(): assert items[0] is token +@pytest.mark.issue(744) @pytest.mark.parametrize("text", ["We were scared", "We Were Scared"]) def test_issue744(en_tokenizer, text): """Test that 'were' and 'Were' are excluded from the contractions @@ -239,6 +257,7 @@ def test_issue744(en_tokenizer, text): assert tokens[1].text.lower() == "were" +@pytest.mark.issue(759) @pytest.mark.parametrize( "text,is_num", [("one", True), ("ten", True), ("teneleven", False)] ) @@ -247,6 +266,7 @@ def test_issue759(en_tokenizer, text, is_num): assert tokens[0].like_num == is_num +@pytest.mark.issue(775) @pytest.mark.parametrize("text", ["Shell", "shell", "Shed", "shed"]) def test_issue775(en_tokenizer, text): """Test that 'Shell' and 'shell' are excluded from the contractions @@ -256,6 +276,7 @@ def test_issue775(en_tokenizer, text): assert tokens[0].text == text +@pytest.mark.issue(792) @pytest.mark.parametrize("text", ["This is a string ", "This is a string\u0020"]) def test_issue792(en_tokenizer, text): """Test for Issue #792: Trailing whitespace is removed after tokenization.""" @@ -263,6 +284,7 @@ def test_issue792(en_tokenizer, text): assert "".join([token.text_with_ws for token in doc]) == text +@pytest.mark.issue(792) @pytest.mark.parametrize("text", ["This is a string", "This is a string\n"]) def test_control_issue792(en_tokenizer, text): """Test base case for Issue #792: Non-trailing whitespace""" @@ -270,6 +292,7 @@ def test_control_issue792(en_tokenizer, text): assert "".join([token.text_with_ws for token in doc]) == text +@pytest.mark.issue(801) @pytest.mark.skip( reason="Can not be fixed unless with variable-width lookbehinds, cf. PR #3218" ) @@ -292,6 +315,7 @@ def test_issue801(en_tokenizer, text, tokens): assert [t.text for t in doc] == tokens +@pytest.mark.issue(805) @pytest.mark.parametrize( "text,expected_tokens", [ @@ -311,6 +335,7 @@ def test_issue805(sv_tokenizer, text, expected_tokens): assert expected_tokens == token_list +@pytest.mark.issue(850) def test_issue850(): """The variable-length pattern matches the succeeding token. 
Check we handle the ambiguity correctly.""" @@ -326,6 +351,7 @@ def test_issue850(): assert end == 4 +@pytest.mark.issue(850) def test_issue850_basic(): """Test Matcher matches with '*' operator and Boolean flag""" vocab = Vocab(lex_attr_getters={LOWER: lambda string: string.lower()}) @@ -340,6 +366,7 @@ def test_issue850_basic(): assert end == 4 +@pytest.mark.issue(852) @pytest.mark.skip( reason="French exception list is not enabled in the default tokenizer anymore" ) @@ -352,6 +379,7 @@ def test_issue852(fr_tokenizer, text): assert len(tokens) == 1 +@pytest.mark.issue(859) @pytest.mark.parametrize( "text", ["aaabbb@ccc.com\nThank you!", "aaabbb@ccc.com \nThank you!"] ) @@ -361,6 +389,7 @@ def test_issue859(en_tokenizer, text): assert doc.text == text +@pytest.mark.issue(886) @pytest.mark.parametrize("text", ["Datum:2014-06-02\nDokument:76467"]) def test_issue886(en_tokenizer, text): """Test that token.idx matches the original text index for texts with newlines.""" @@ -370,6 +399,7 @@ def test_issue886(en_tokenizer, text): assert text[token.idx] == token.text[0] +@pytest.mark.issue(891) @pytest.mark.parametrize("text", ["want/need"]) def test_issue891(en_tokenizer, text): """Test that / infixes are split correctly.""" @@ -378,6 +408,7 @@ def test_issue891(en_tokenizer, text): assert tokens[1].text == "/" +@pytest.mark.issue(912) @pytest.mark.skip(reason="Old vocab-based lemmatization") @pytest.mark.parametrize( "text,tag,lemma", @@ -390,6 +421,7 @@ def test_issue912(en_vocab, text, tag, lemma): assert doc[0].lemma_ == lemma +@pytest.mark.issue(957) @pytest.mark.slow def test_issue957(en_tokenizer): """Test that spaCy doesn't hang on many punctuation characters. @@ -405,6 +437,7 @@ def test_issue957(en_tokenizer): assert doc +@pytest.mark.issue(999) def test_issue999(): """Test that adding entities and resuming training works passably OK. There are two issues here: diff --git a/spacy/tests/regression/test_issue1001-1500.py b/spacy/tests/regression/test_issue1001-1500.py index d6a4600e3..0a60e4477 100644 --- a/spacy/tests/regression/test_issue1001-1500.py +++ b/spacy/tests/regression/test_issue1001-1500.py @@ -9,6 +9,7 @@ from spacy.tokenizer import Tokenizer from spacy.symbols import ORTH, LEMMA, POS +@pytest.mark.issue(1061) def test_issue1061(): """Test special-case works after tokenizing. Was caching problem.""" text = "I like _MATH_ even _MATH_ when _MATH_, except when _MATH_ is _MATH_! but not _MATH_." 
@@ -33,6 +34,7 @@ def test_issue1061(): @pytest.mark.skip( reason="Can not be fixed without variable-width look-behind (which we don't want)" ) +@pytest.mark.issue(1235) def test_issue1235(): """Test that g is not split of if preceded by a number and a letter""" nlp = English() @@ -46,6 +48,7 @@ def test_issue1235(): assert doc[4].text == "g" +@pytest.mark.issue(1242) def test_issue1242(): nlp = English() doc = nlp("") @@ -56,6 +59,7 @@ def test_issue1242(): @pytest.mark.skip(reason="v3 no longer supports LEMMA/POS in tokenizer special cases") +@pytest.mark.issue(1250) def test_issue1250(): """Test cached special cases.""" special_case = [{ORTH: "reimbur", LEMMA: "reimburse", POS: "VERB"}] @@ -67,6 +71,7 @@ def test_issue1250(): assert lemmas == ["reimburse", ",", "reimburse", "..."] +@pytest.mark.issue(1257) def test_issue1257(): """Test that tokens compare correctly.""" doc1 = Doc(Vocab(), words=["a", "b", "c"]) @@ -75,6 +80,7 @@ def test_issue1257(): assert not doc1[0] == doc2[0] +@pytest.mark.issue(1375) def test_issue1375(): """Test that token.nbor() raises IndexError for out-of-bounds access.""" doc = Doc(Vocab(), words=["0", "1", "2"]) @@ -86,6 +92,7 @@ def test_issue1375(): assert doc[1].nbor(1).text == "2" +@pytest.mark.issue(1434) def test_issue1434(): """Test matches occur when optional element at end of short doc.""" pattern = [{"ORTH": "Hello"}, {"IS_ALPHA": True, "OP": "?"}] @@ -111,6 +118,7 @@ def test_issue1434(): ("a b b", 0, 3), ], ) +@pytest.mark.issue(1450) def test_issue1450(string, start, end): """Test matcher works when patterns end with * operator.""" pattern = [{"ORTH": "a"}, {"ORTH": "b", "OP": "*"}] @@ -124,6 +132,7 @@ def test_issue1450(string, start, end): assert matches[-1][2] == end +@pytest.mark.issue(1488) def test_issue1488(): prefix_re = re.compile(r"""[\[\("']""") suffix_re = re.compile(r"""[\]\)"']""") @@ -147,6 +156,7 @@ def test_issue1488(): assert token.text +@pytest.mark.issue(1494) def test_issue1494(): infix_re = re.compile(r"""[^a-z]""") test_cases = [ diff --git a/spacy/tests/regression/test_issue1501-2000.py b/spacy/tests/regression/test_issue1501-2000.py index f85ec70e1..07f173843 100644 --- a/spacy/tests/regression/test_issue1501-2000.py +++ b/spacy/tests/regression/test_issue1501-2000.py @@ -17,6 +17,7 @@ from spacy.matcher import Matcher from ..util import make_tempdir +@pytest.mark.issue(1506) def test_issue1506(): def string_generator(): for _ in range(10001): @@ -40,6 +41,7 @@ def test_issue1506(): str(t.lemma_) +@pytest.mark.issue(1518) def test_issue1518(): """Test vectors.resize() works.""" vectors = Vectors(shape=(10, 10)) @@ -47,6 +49,7 @@ def test_issue1518(): vectors.resize((5, 9)) +@pytest.mark.issue(1537) def test_issue1537(): """Test that Span.as_doc() doesn't segfault.""" string = "The sky is blue . The man is pink . The dog is purple ." @@ -65,6 +68,7 @@ def test_issue1537(): # TODO: Currently segfaulting, due to l_edge and r_edge misalignment +@pytest.mark.issue(1537) # def test_issue1537_model(): # nlp = load_spacy('en') # doc = nlp('The sky is blue. The man is pink. 
The dog is purple.') @@ -73,12 +77,14 @@ def test_issue1537(): # print(list(sents[1].noun_chunks)) +@pytest.mark.issue(1539) def test_issue1539(): """Ensure vectors.resize() doesn't try to modify dictionary during iteration.""" v = Vectors(shape=(10, 10), keys=[5, 3, 98, 100]) v.resize((100, 100)) +@pytest.mark.issue(1547) def test_issue1547(): """Test that entity labels still match after merging tokens.""" words = ["\n", "worda", ".", "\n", "wordb", "-", "Biosphere", "2", "-", " \n"] @@ -89,12 +95,14 @@ def test_issue1547(): assert [ent.text for ent in doc.ents] +@pytest.mark.issue(1612) def test_issue1612(en_tokenizer): doc = en_tokenizer("The black cat purrs.") span = doc[1:3] assert span.orth_ == span.text +@pytest.mark.issue(1654) def test_issue1654(): nlp = Language(Vocab()) assert not nlp.pipeline @@ -116,12 +124,14 @@ def test_issue1654(): @pytest.mark.parametrize("text", ["test@example.com", "john.doe@example.co.uk"]) +@pytest.mark.issue(1698) def test_issue1698(en_tokenizer, text): doc = en_tokenizer(text) assert len(doc) == 1 assert not doc[0].like_url +@pytest.mark.issue(1727) def test_issue1727(): """Test that models with no pretrained vectors can be deserialized correctly after vectors are added.""" @@ -138,6 +148,7 @@ def test_issue1727(): assert tagger.cfg.get("pretrained_dims", 0) == 0 +@pytest.mark.issue(1757) def test_issue1757(): """Test comparison against None doesn't cause segfault.""" doc = Doc(Vocab(), words=["a", "b", "c"]) @@ -151,12 +162,14 @@ def test_issue1757(): assert not doc.vocab["a"] < None +@pytest.mark.issue(1758) def test_issue1758(en_tokenizer): """Test that "would've" is handled by the English tokenizer exceptions.""" tokens = en_tokenizer("would've") assert len(tokens) == 2 +@pytest.mark.issue(1773) def test_issue1773(en_tokenizer): """Test that spaces don't receive a POS but no TAG. 
This is the root cause of the serialization issue reported in #1773.""" @@ -165,6 +178,7 @@ def test_issue1773(en_tokenizer): assert doc[0].tag_ != "" +@pytest.mark.issue(1799) def test_issue1799(): """Test sentence boundaries are deserialized correctly, even for non-projective sentences.""" @@ -186,6 +200,7 @@ def test_issue1799(): assert len(list(doc.sents)) == 1 +@pytest.mark.issue(1807) def test_issue1807(): """Test vocab.set_vector also adds the word to the vocab.""" vocab = Vocab(vectors_name="test_issue1807") @@ -194,6 +209,7 @@ def test_issue1807(): assert "hello" in vocab +@pytest.mark.issue(1834) def test_issue1834(): """Test that sentence boundaries & parse/tag flags are not lost during serialization.""" @@ -217,6 +233,7 @@ def test_issue1834(): assert new_doc.has_annotation("TAG") +@pytest.mark.issue(1868) def test_issue1868(): """Test Vocab.__contains__ works with int keys.""" vocab = Vocab() @@ -228,6 +245,7 @@ def test_issue1868(): assert int_id not in vocab +@pytest.mark.issue(1883) def test_issue1883(): matcher = Matcher(Vocab()) matcher.add("pat1", [[{"orth": "hello"}]]) @@ -239,11 +257,13 @@ def test_issue1883(): @pytest.mark.parametrize("word", ["the"]) +@pytest.mark.issue(1889) def test_issue1889(word): assert is_stop(word, STOP_WORDS) == is_stop(word.upper(), STOP_WORDS) @pytest.mark.skip(reason="obsolete with the config refactor of v.3") +@pytest.mark.issue(1915) def test_issue1915(): cfg = {"hidden_depth": 2} # should error out nlp = Language() @@ -253,6 +273,7 @@ def test_issue1915(): nlp.initialize(**cfg) +@pytest.mark.issue(1945) def test_issue1945(): """Test regression in Matcher introduced in v2.0.6.""" matcher = Matcher(Vocab()) @@ -264,6 +285,7 @@ def test_issue1945(): assert matches[1][1:] == (1, 3) +@pytest.mark.issue(1963) def test_issue1963(en_tokenizer): """Test that doc.merge() resizes doc.tensor""" doc = en_tokenizer("a b c d") @@ -275,6 +297,7 @@ def test_issue1963(en_tokenizer): @pytest.mark.parametrize("label", ["U-JOB-NAME"]) +@pytest.mark.issue(1967) def test_issue1967(label): nlp = Language() config = {} @@ -293,6 +316,7 @@ def test_issue1967(label): assert "JOB-NAME" in ner.moves.get_actions(examples=[example])[1] +@pytest.mark.issue(1971) def test_issue1971(en_vocab): # Possibly related to #2675 and #2671? matcher = Matcher(en_vocab) diff --git a/spacy/tests/regression/test_issue2001-2500.py b/spacy/tests/regression/test_issue2001-2500.py index 09baab4d8..a07360c2c 100644 --- a/spacy/tests/regression/test_issue2001-2500.py +++ b/spacy/tests/regression/test_issue2001-2500.py @@ -13,6 +13,7 @@ from ..util import add_vecs_to_vocab @pytest.mark.skip( reason="Can not be fixed without iterative looping between prefix/suffix and infix" ) +@pytest.mark.issue(2070) def test_issue2070(): """Test that checks that a dot followed by a quote is handled appropriately. 
@@ -25,6 +26,7 @@ def test_issue2070(): assert len(doc) == 11 +@pytest.mark.issue(2179) def test_issue2179(): """Test that spurious 'extra_labels' aren't created when initializing NER.""" nlp = Italian() @@ -41,6 +43,7 @@ def test_issue2179(): assert nlp2.get_pipe("ner").labels == ("CITIZENSHIP",) +@pytest.mark.issue(2203) def test_issue2203(en_vocab): """Test that lemmas are set correctly in doc.from_array.""" words = ["I", "'ll", "survive"] @@ -61,6 +64,7 @@ def test_issue2203(en_vocab): assert [t.lemma_ for t in new_doc] == lemmas +@pytest.mark.issue(2219) def test_issue2219(en_vocab): vectors = [("a", [1, 2, 3]), ("letter", [4, 5, 6])] add_vecs_to_vocab(en_vocab, vectors) @@ -69,6 +73,7 @@ def test_issue2219(en_vocab): assert doc[0].similarity(doc[1]) == doc[1].similarity(doc[0]) +@pytest.mark.issue(2361) def test_issue2361(de_vocab): chars = ("<", ">", "&", """) words = ["<", ">", "&", '"'] @@ -78,6 +83,7 @@ def test_issue2361(de_vocab): assert char in html +@pytest.mark.issue(2385) def test_issue2385(): """Test that IOB tags are correctly converted to BILUO tags.""" # fix bug in labels with a 'b' character @@ -99,11 +105,13 @@ def test_issue2385(): ("U-BRAWLER", "U-BRAWLER"), ], ) +@pytest.mark.issue(2385) def test_issue2385_biluo(tags): """Test that BILUO-compatible tags aren't modified.""" assert iob_to_biluo(tags) == list(tags) +@pytest.mark.issue(2396) def test_issue2396(en_vocab): words = ["She", "created", "a", "test", "for", "spacy"] heads = [1, 1, 3, 1, 3, 4] @@ -125,6 +133,7 @@ def test_issue2396(en_vocab): assert (span.get_lca_matrix() == matrix).all() +@pytest.mark.issue(2464) def test_issue2464(en_vocab): """Test problem with successive ?. This is the same bug, so putting it here.""" matcher = Matcher(en_vocab) @@ -134,6 +143,7 @@ def test_issue2464(en_vocab): assert len(matches) == 3 +@pytest.mark.issue(2482) def test_issue2482(): """Test we can serialize and deserialize a blank NER or parser model.""" nlp = Italian() diff --git a/spacy/tests/regression/test_issue2501-3000.py b/spacy/tests/regression/test_issue2501-3000.py index 4952a545d..cbb7f0621 100644 --- a/spacy/tests/regression/test_issue2501-3000.py +++ b/spacy/tests/regression/test_issue2501-3000.py @@ -13,6 +13,7 @@ import numpy import random +@pytest.mark.issue(2564) def test_issue2564(): """Test the tagger sets has_annotation("TAG") correctly when used via Language.pipe.""" nlp = Language() @@ -26,6 +27,7 @@ def test_issue2564(): assert piped_doc.has_annotation("TAG") +@pytest.mark.issue(2569) def test_issue2569(en_tokenizer): """Test that operator + is greedy.""" doc = en_tokenizer("It is May 15, 1993.") @@ -46,12 +48,14 @@ def test_issue2569(en_tokenizer): "oow.jspsearch.eventoracleopenworldsearch.technologyoraclesolarissearch.technologystoragesearch.technologylinuxsearch.technologyserverssearch.technologyvirtualizationsearch.technologyengineeredsystemspcodewwmkmppscem:", ], ) +@pytest.mark.issue(2626) def test_issue2626_2835(en_tokenizer, text): """Check that sentence doesn't cause an infinite loop in the tokenizer.""" doc = en_tokenizer(text) assert doc +@pytest.mark.issue(2656) def test_issue2656(en_tokenizer): """Test that tokenizer correctly splits off punctuation after numbers with decimal points. @@ -71,6 +75,7 @@ def test_issue2656(en_tokenizer): assert doc[10].text == "." +@pytest.mark.issue(2671) def test_issue2671(): """Ensure the correct entity ID is returned for matches with quantifiers. 
See also #2675 @@ -94,6 +99,7 @@ def test_issue2671(): assert nlp.vocab.strings[match_id] == pattern_id +@pytest.mark.issue(2728) def test_issue2728(en_vocab): """Test that displaCy ENT visualizer escapes HTML correctly.""" doc = Doc(en_vocab, words=["test", "", "test"]) @@ -105,6 +111,7 @@ def test_issue2728(en_vocab): assert "<RELEASE>" in html +@pytest.mark.issue(2754) def test_issue2754(en_tokenizer): """Test that words like 'a' and 'a.m.' don't get exceptional norm values.""" a = en_tokenizer("a") @@ -113,6 +120,7 @@ def test_issue2754(en_tokenizer): assert am[0].norm_ == "am" +@pytest.mark.issue(2772) def test_issue2772(en_vocab): """Test that deprojectivization doesn't mess up sentence boundaries.""" # fmt: off @@ -128,6 +136,7 @@ def test_issue2772(en_vocab): @pytest.mark.parametrize("text", ["-0.23", "+123,456", "±1"]) @pytest.mark.parametrize("lang_cls", [English, MultiLanguage]) +@pytest.mark.issue(2782) def test_issue2782(text, lang_cls): """Check that like_num handles + and - before number.""" nlp = lang_cls() @@ -136,6 +145,7 @@ def test_issue2782(text, lang_cls): assert doc[0].like_num +@pytest.mark.issue(2800) def test_issue2800(): """Test issue that arises when too many labels are added to NER model. Used to cause segfault. @@ -157,6 +167,7 @@ def test_issue2800(): nlp.update([example], sgd=optimizer, losses=losses, drop=0.5) +@pytest.mark.issue(2822) def test_issue2822(it_tokenizer): """Test that the abbreviation of poco is kept as one word.""" doc = it_tokenizer("Vuoi un po' di zucchero?") @@ -169,6 +180,7 @@ def test_issue2822(it_tokenizer): assert doc[5].text == "?" +@pytest.mark.issue(2833) def test_issue2833(en_vocab): """Test that a custom error is raised if a token or span is pickled.""" doc = Doc(en_vocab, words=["Hello", "world"]) @@ -178,6 +190,7 @@ def test_issue2833(en_vocab): pickle.dumps(doc[0:2]) +@pytest.mark.issue(2871) def test_issue2871(): """Test that vectors recover the correct key for spaCy reserved words.""" words = ["dog", "cat", "SUFFIX"] @@ -196,6 +209,7 @@ def test_issue2871(): assert vocab.vectors.find(key="SUFFIX") == 2 +@pytest.mark.issue(2901) def test_issue2901(): """Test that `nlp` doesn't fail.""" try: @@ -207,6 +221,7 @@ def test_issue2901(): assert doc +@pytest.mark.issue(2926) def test_issue2926(fr_tokenizer): """Test that the tokenizer correctly splits tokens separated by a slash (/) ending in a digit. diff --git a/spacy/tests/regression/test_issue3001-3500.py b/spacy/tests/regression/test_issue3001-3500.py index e123d2df9..6220003dc 100644 --- a/spacy/tests/regression/test_issue3001-3500.py +++ b/spacy/tests/regression/test_issue3001-3500.py @@ -14,6 +14,7 @@ from spacy.vectors import Vectors import numpy +@pytest.mark.issue(3002) def test_issue3002(): """Test that the tokenizer doesn't hang on a long list of dots""" nlp = German() @@ -23,6 +24,7 @@ def test_issue3002(): assert len(doc) == 5 +@pytest.mark.issue(3009) def test_issue3009(en_vocab): """Test problem with matcher quantifiers""" patterns = [ @@ -53,6 +55,7 @@ def test_issue3009(en_vocab): assert matches +@pytest.mark.issue(3012) def test_issue3012(en_vocab): """Test that the is_tagged attribute doesn't get overwritten when we from_array without tag information.""" @@ -74,6 +77,7 @@ def test_issue3012(en_vocab): assert (doc2[2].text, doc2[2].pos_, doc2[2].tag_, doc2[2].ent_type_) == expected +@pytest.mark.issue(3199) def test_issue3199(): """Test that Span.noun_chunks works correctly if no noun chunks iterator is available. 
To make this test future-proof, we're constructing a Doc @@ -85,6 +89,7 @@ def test_issue3199(): list(doc[0:3].noun_chunks) +@pytest.mark.issue(3209) def test_issue3209(): """Test issue that occurred in spaCy nightly where NER labels were being mapped to classes incorrectly after loading the model, when the labels @@ -104,6 +109,7 @@ def test_issue3209(): assert ner2.move_names == move_names +@pytest.mark.issue(3248) def test_issue3248_1(): """Test that the PhraseMatcher correctly reports its number of rules, not total number of patterns.""" @@ -114,6 +120,7 @@ def test_issue3248_1(): assert len(matcher) == 2 +@pytest.mark.issue(3248) def test_issue3248_2(): """Test that the PhraseMatcher can be pickled correctly.""" nlp = English() @@ -125,6 +132,7 @@ def test_issue3248_2(): assert len(new_matcher) == len(matcher) +@pytest.mark.issue(3277) def test_issue3277(es_tokenizer): """Test that hyphens are split correctly as prefixes.""" doc = es_tokenizer("—Yo me llamo... –murmuró el niño– Emilio Sánchez Pérez.") @@ -134,6 +142,7 @@ def test_issue3277(es_tokenizer): assert doc[9].text == "\u2013" +@pytest.mark.issue(3288) def test_issue3288(en_vocab): """Test that retokenization works correctly via displaCy when punctuation is merged onto the preceeding token and tensor is resized.""" @@ -145,6 +154,7 @@ def test_issue3288(en_vocab): displacy.render(doc) +@pytest.mark.issue(3289) def test_issue3289(): """Test that Language.to_bytes handles serializing a pipeline component with an uninitialized model.""" @@ -156,6 +166,7 @@ def test_issue3289(): new_nlp.from_bytes(bytes_data) +@pytest.mark.issue(3328) def test_issue3328(en_vocab): doc = Doc(en_vocab, words=["Hello", ",", "how", "are", "you", "doing", "?"]) matcher = Matcher(en_vocab) @@ -170,6 +181,7 @@ def test_issue3328(en_vocab): assert matched_texts == ["Hello", "how", "you", "doing"] +@pytest.mark.issue(3331) def test_issue3331(en_vocab): """Test that duplicate patterns for different rules result in multiple matches, one per rule. 
@@ -184,6 +196,7 @@ def test_issue3331(en_vocab): assert sorted(match_ids) == ["A", "B"] +@pytest.mark.issue(3345) def test_issue3345(): """Test case where preset entity crosses sentence boundary.""" nlp = English() @@ -206,6 +219,7 @@ def test_issue3345(): assert ner.moves.is_valid(state, "B-GPE") +@pytest.mark.issue(3412) def test_issue3412(): data = numpy.asarray([[0, 0, 0], [1, 2, 3], [9, 8, 7]], dtype="f") vectors = Vectors(data=data, keys=["A", "B", "C"]) @@ -216,6 +230,7 @@ def test_issue3412(): @pytest.mark.skip(reason="default suffix rules avoid one upper-case letter before dot") +@pytest.mark.issue(3449) def test_issue3449(): nlp = English() nlp.add_pipe("sentencizer") @@ -230,6 +245,7 @@ def test_issue3449(): assert t3[5].text == "I" +@pytest.mark.issue(3456) def test_issue3456(): # this crashed because of a padding error in layer.ops.unflatten in thinc nlp = English() @@ -239,6 +255,7 @@ def test_issue3456(): list(nlp.pipe(["hi", ""])) +@pytest.mark.issue(3468) def test_issue3468(): """Test that sentence boundaries are set correctly so Doc.has_annotation("SENT_START") can be restored after serialization.""" diff --git a/spacy/tests/regression/test_issue3501-4000.py b/spacy/tests/regression/test_issue3501-4000.py index 71c3768dd..5d9bc4e83 100644 --- a/spacy/tests/regression/test_issue3501-4000.py +++ b/spacy/tests/regression/test_issue3501-4000.py @@ -24,6 +24,7 @@ from ..util import make_tempdir @pytest.mark.parametrize("word", ["don't", "don’t", "I'd", "I’d"]) +@pytest.mark.issue(3521) def test_issue3521(en_tokenizer, word): tok = en_tokenizer(word)[1] # 'not' and 'would' should be stopwords, also in their abbreviated forms @@ -108,6 +109,7 @@ def test_issue_3526_4(en_vocab): assert new_ruler.overwrite is True +@pytest.mark.issue(3531) def test_issue3531(): """Test that displaCy renderer doesn't require "settings" key.""" example_dep = { @@ -137,6 +139,7 @@ def test_issue3531(): assert ent_html +@pytest.mark.issue(3540) def test_issue3540(en_vocab): words = ["I", "live", "in", "NewYork", "right", "now"] tensor = numpy.asarray( @@ -176,6 +179,7 @@ def test_issue3540(en_vocab): assert vectors_1[5].tolist() == vectors_2[6].tolist() +@pytest.mark.issue(3549) def test_issue3549(en_vocab): """Test that match pattern validation doesn't raise on empty errors.""" matcher = Matcher(en_vocab, validate=True) @@ -186,6 +190,7 @@ def test_issue3549(en_vocab): @pytest.mark.skip("Matching currently only works on strings and integers") +@pytest.mark.issue(3555) def test_issue3555(en_vocab): """Test that custom extensions with default None don't break matcher.""" Token.set_extension("issue3555", default=None) @@ -196,6 +201,7 @@ def test_issue3555(en_vocab): matcher(doc) +@pytest.mark.issue(3611) def test_issue3611(): """Test whether adding n-grams in the textcat works even when n > token length of some docs""" unique_classes = ["offensive", "inoffensive"] @@ -232,6 +238,7 @@ def test_issue3611(): nlp.update(examples=batch, sgd=optimizer, drop=0.1, losses=losses) +@pytest.mark.issue(3625) def test_issue3625(): """Test that default punctuation rules applies to hindi unicode characters""" nlp = Hindi() @@ -240,6 +247,7 @@ def test_issue3625(): assert [token.text for token in doc] == expected +@pytest.mark.issue(3803) def test_issue3803(): """Test that spanish num-like tokens have True for like_num attribute.""" nlp = Spanish() @@ -255,6 +263,7 @@ def _parser_example(parser): return Example.from_dict(doc, gold) +@pytest.mark.issue(3830) def test_issue3830_no_subtok(): """Test that the parser 
doesn't have subtok label if not learn_tokens""" config = { @@ -268,6 +277,7 @@ def test_issue3830_no_subtok(): assert "subtok" not in parser.labels +@pytest.mark.issue(3830) def test_issue3830_with_subtok(): """Test that the parser does have subtok label if learn_tokens=True.""" config = { @@ -281,6 +291,7 @@ def test_issue3830_with_subtok(): assert "subtok" in parser.labels +@pytest.mark.issue(3839) def test_issue3839(en_vocab): """Test that match IDs returned by the matcher are correct, are in the string""" doc = Doc(en_vocab, words=["terrific", "group", "of", "people"]) @@ -307,6 +318,7 @@ def test_issue3839(en_vocab): "It was a missed assignment, but it shouldn't have resulted in a turnover ...", ], ) +@pytest.mark.issue(3869) def test_issue3869(sentence): """Test that the Doc's count_by function works consistently""" nlp = English() @@ -317,6 +329,7 @@ def test_issue3869(sentence): assert count == doc.count_by(IS_ALPHA).get(1, 0) +@pytest.mark.issue(3879) def test_issue3879(en_vocab): doc = Doc(en_vocab, words=["This", "is", "a", "test", "."]) assert len(doc) == 5 @@ -326,6 +339,7 @@ def test_issue3879(en_vocab): assert len(matcher(doc)) == 2 # fails because of a FP match 'is a test' +@pytest.mark.issue(3880) def test_issue3880(): """Test that `nlp.pipe()` works when an empty string ends the batch. @@ -341,6 +355,7 @@ def test_issue3880(): pass +@pytest.mark.issue(3882) def test_issue3882(en_vocab): """Test that displaCy doesn't serialize the doc.user_data when making a copy of the Doc. @@ -350,6 +365,7 @@ def test_issue3882(en_vocab): parse_deps(doc) +@pytest.mark.issue(3951) def test_issue3951(en_vocab): """Test that combinations of optional rules are matched correctly.""" matcher = Matcher(en_vocab) @@ -365,6 +381,7 @@ def test_issue3951(en_vocab): assert len(matches) == 0 +@pytest.mark.issue(3959) def test_issue3959(): """Ensure that a modified pos attribute is serialized correctly.""" nlp = English() @@ -383,6 +400,7 @@ def test_issue3959(): assert doc2[0].pos_ == "NOUN" +@pytest.mark.issue(3962) def test_issue3962(en_vocab): """Ensure that as_doc does not result in out-of-bound access of tokens. This is achieved by setting the head to itself if it would lie out of the span otherwise.""" @@ -421,6 +439,7 @@ def test_issue3962(en_vocab): assert len(list(doc3.sents)) == 1 +@pytest.mark.issue(3962) def test_issue3962_long(en_vocab): """Ensure that as_doc does not result in out-of-bound access of tokens. 
This is achieved by setting the head to itself if it would lie out of the span otherwise.""" @@ -456,6 +475,7 @@ def test_issue3962_long(en_vocab): assert sents[1].text == "They never" +@pytest.mark.issue(3972) def test_issue3972(en_vocab): """Test that the PhraseMatcher returns duplicates for duplicate match IDs.""" matcher = PhraseMatcher(en_vocab) diff --git a/spacy/tests/regression/test_issue4001-4500.py b/spacy/tests/regression/test_issue4001-4500.py index 4410e6236..7b7c304a3 100644 --- a/spacy/tests/regression/test_issue4001-4500.py +++ b/spacy/tests/regression/test_issue4001-4500.py @@ -17,6 +17,7 @@ from thinc.api import compounding from ..util import make_tempdir +@pytest.mark.issue(4002) def test_issue4002(en_vocab): """Test that the PhraseMatcher can match on overwritten NORM attributes.""" matcher = PhraseMatcher(en_vocab, attr="NORM") @@ -37,6 +38,7 @@ def test_issue4002(en_vocab): assert len(matches) == 1 +@pytest.mark.issue(4030) def test_issue4030(): """Test whether textcat works fine with empty doc""" unique_classes = ["offensive", "inoffensive"] @@ -77,6 +79,7 @@ def test_issue4030(): assert doc.cats["inoffensive"] == 0.0 +@pytest.mark.issue(4042) def test_issue4042(): """Test that serialization of an EntityRuler before NER works fine.""" nlp = English() @@ -105,6 +108,7 @@ def test_issue4042(): assert doc2.ents[0].label_ == "MY_ORG" +@pytest.mark.issue(4042) def test_issue4042_bug2(): """ Test that serialization of an NER works fine when new labels were added. @@ -139,6 +143,7 @@ def test_issue4042_bug2(): assert len(ner2.labels) == 2 +@pytest.mark.issue(4054) def test_issue4054(en_vocab): """Test that a new blank model can be made with a vocab from file, and that serialization does not drop the language at any point.""" @@ -159,6 +164,7 @@ def test_issue4054(en_vocab): assert nlp3.lang == "en" +@pytest.mark.issue(4120) def test_issue4120(en_vocab): """Test that matches without a final {OP: ?} token are returned.""" matcher = Matcher(en_vocab) @@ -177,6 +183,7 @@ def test_issue4120(en_vocab): assert len(matcher(doc4)) == 3 # fixed +@pytest.mark.issue(4133) def test_issue4133(en_vocab): nlp = English() vocab_bytes = nlp.vocab.to_bytes() @@ -196,6 +203,7 @@ def test_issue4133(en_vocab): assert actual == pos +@pytest.mark.issue(4190) def test_issue4190(): def customize_tokenizer(nlp): prefix_re = compile_prefix_regex(nlp.Defaults.prefixes) @@ -236,6 +244,7 @@ def test_issue4190(): assert result_1b == result_2 +@pytest.mark.issue(4267) def test_issue4267(): """Test that running an entity_ruler after ner gives consistent results""" nlp = English() @@ -262,6 +271,7 @@ def test_issue4267(): @pytest.mark.skip(reason="lemmatizer lookups no longer in vocab") +@pytest.mark.issue(4272) def test_issue4272(): """Test that lookup table can be accessed from Token.lemma if no POS tags are available.""" @@ -287,6 +297,7 @@ def test_multiple_predictions(): dummy_pipe(doc) +@pytest.mark.issue(4313) def test_issue4313(): """This should not crash or exit with some strange error code""" beam_width = 16 @@ -313,6 +324,7 @@ def test_issue4313(): assert "MY_ORG" in ner.labels +@pytest.mark.issue(4348) def test_issue4348(): """Test that training the tagger with empty data, doesn't throw errors""" nlp = English() @@ -328,6 +340,7 @@ def test_issue4348(): nlp.update(batch, sgd=optimizer, losses=losses) +@pytest.mark.issue(4367) def test_issue4367(): """Test that docbin init goes well""" DocBin() @@ -335,6 +348,7 @@ def test_issue4367(): DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"]) 
+@pytest.mark.issue(4373) def test_issue4373(): """Test that PhraseMatcher.vocab can be accessed (like Matcher.vocab).""" matcher = Matcher(Vocab()) @@ -343,6 +357,7 @@ def test_issue4373(): assert isinstance(matcher.vocab, Vocab) +@pytest.mark.issue(4402) def test_issue4402(): json_data = { "id": 0, diff --git a/spacy/tests/regression/test_issue4501-5000.py b/spacy/tests/regression/test_issue4501-5000.py index effd67306..07a00d2b7 100644 --- a/spacy/tests/regression/test_issue4501-5000.py +++ b/spacy/tests/regression/test_issue4501-5000.py @@ -14,6 +14,7 @@ from thinc.api import NumpyOps, get_current_ops from ..util import make_tempdir +@pytest.mark.issue(4528) def test_issue4528(en_vocab): """Test that user_data is correctly serialized in DocBin.""" doc = Doc(en_vocab, words=["hello", "world"]) @@ -37,6 +38,7 @@ def test_gold_misaligned(en_tokenizer, text, words): Example.from_dict(doc, {"words": words}) +@pytest.mark.issue(4651) def test_issue4651_with_phrase_matcher_attr(): """Test that the EntityRuler PhraseMatcher is deserialized correctly using the method from_disk when the EntityRuler argument phrase_matcher_attr is @@ -59,6 +61,7 @@ def test_issue4651_with_phrase_matcher_attr(): assert res == res_reloaded +@pytest.mark.issue(4651) def test_issue4651_without_phrase_matcher_attr(): """Test that the EntityRuler PhraseMatcher is deserialized correctly using the method from_disk when the EntityRuler argument phrase_matcher_attr is @@ -81,6 +84,7 @@ def test_issue4651_without_phrase_matcher_attr(): assert res == res_reloaded +@pytest.mark.issue(4665) def test_issue4665(): """ conllu_to_docs should not raise an exception if the HEAD column contains an @@ -109,6 +113,7 @@ def test_issue4665(): conllu_to_docs(input_data) +@pytest.mark.issue(4674) def test_issue4674(): """Test that setting entities with overlapping identifiers does not mess up IO""" nlp = English() @@ -135,6 +140,7 @@ def test_issue4674(): @pytest.mark.skip(reason="API change: disable just disables, new exclude arg") +@pytest.mark.issue(4707) def test_issue4707(): """Tests that disabled component names are also excluded from nlp.from_disk by default when loading a model. @@ -151,6 +157,7 @@ def test_issue4707(): assert "entity_ruler" in new_nlp.pipe_names +@pytest.mark.issue(4725) def test_issue4725_1(): """Ensure the pickling of the NER goes well""" vocab = Vocab(vectors_name="test_vocab_add_vector") @@ -169,6 +176,7 @@ def test_issue4725_1(): assert ner2.cfg["update_with_oracle_cut_size"] == 111 +@pytest.mark.issue(4725) def test_issue4725_2(): if isinstance(get_current_ops, NumpyOps): # ensures that this runs correctly and doesn't hang or crash because of the global vectors @@ -188,6 +196,7 @@ def test_issue4725_2(): pass +@pytest.mark.issue(4849) def test_issue4849(): nlp = English() patterns = [ @@ -235,6 +244,7 @@ class CustomPipe: return str(span.end) +@pytest.mark.issue(4903) def test_issue4903(): """Ensure that this runs correctly and doesn't hang or crash on Windows / macOS.""" @@ -249,6 +259,7 @@ def test_issue4903(): assert docs[2].text == "No, I prefer wasabi." 
+@pytest.mark.issue(4924) def test_issue4924(): nlp = Language() example = Example.from_dict(nlp.make_doc(""), {}) diff --git a/spacy/tests/regression/test_issue5001-5500.py b/spacy/tests/regression/test_issue5001-5500.py index bc9bcb982..e1f5231e7 100644 --- a/spacy/tests/regression/test_issue5001-5500.py +++ b/spacy/tests/regression/test_issue5001-5500.py @@ -12,6 +12,7 @@ import pytest from ...util import make_tempdir +@pytest.mark.issue(5048) def test_issue5048(en_vocab): words = ["This", "is", "a", "sentence"] pos_s = ["DET", "VERB", "DET", "NOUN"] @@ -34,6 +35,7 @@ def test_issue5048(en_vocab): assert v1 == v2 +@pytest.mark.issue(5082) def test_issue5082(): # Ensure the 'merge_entities' pipeline does something sensible for the vectors of the merged tokens nlp = English() @@ -68,6 +70,7 @@ def test_issue5082(): numpy.testing.assert_array_equal(ops.to_numpy(parsed_vectors_2[2]), array34) +@pytest.mark.issue(5137) def test_issue5137(): factory_name = "test_issue5137" pipe_name = "my_component" @@ -98,6 +101,7 @@ def test_issue5137(): assert nlp2.get_pipe(pipe_name).categories == "my_categories" +@pytest.mark.issue(5141) def test_issue5141(en_vocab): """Ensure an empty DocBin does not crash on serialization""" doc_bin = DocBin(attrs=["DEP", "HEAD"]) @@ -107,6 +111,7 @@ def test_issue5141(en_vocab): assert list(doc_bin_2.get_docs(en_vocab)) == [] +@pytest.mark.issue(5152) def test_issue5152(): # Test that the comparison between a Span and a Token, goes well # There was a bug when the number of tokens in the span equaled the number of characters in the token (!) @@ -125,6 +130,7 @@ def test_issue5152(): assert span_2.similarity(span_3) < 1.0 +@pytest.mark.issue(5458) def test_issue5458(): # Test that the noun chuncker does not generate overlapping spans # fmt: off diff --git a/spacy/tests/regression/test_issue5501-6000.py b/spacy/tests/regression/test_issue5501-6000.py index 355ffffeb..87c40ec2a 100644 --- a/spacy/tests/regression/test_issue5501-6000.py +++ b/spacy/tests/regression/test_issue5501-6000.py @@ -25,6 +25,7 @@ from spacy.training import Example multi_label_cnn_config, ], ) +@pytest.mark.issue(5551) def test_issue5551(textcat_config): """Test that after fixing the random seed, the results of the pipeline are truly identical""" component = "textcat" @@ -53,6 +54,7 @@ def test_issue5551(textcat_config): assert_almost_equal(ops.to_numpy(results[0]), ops.to_numpy(results[2]), decimal=5) +@pytest.mark.issue(5838) def test_issue5838(): # Displacy's EntityRenderer break line # not working after last entity @@ -65,6 +67,7 @@ def test_issue5838(): assert found == 4 +@pytest.mark.issue(5918) def test_issue5918(): # Test edge case when merging entities. 
nlp = English() diff --git a/spacy/tests/regression/test_issue6001-6500.py b/spacy/tests/regression/test_issue6001-6500.py index 470b2f388..cb27d39e4 100644 --- a/spacy/tests/regression/test_issue6001-6500.py +++ b/spacy/tests/regression/test_issue6001-6500.py @@ -4,6 +4,7 @@ from spacy.schemas import TokenPattern, TokenPatternSchema import pytest +@pytest.mark.issue(6207) def test_issue6207(en_tokenizer): doc = en_tokenizer("zero one two three four five six") @@ -18,6 +19,7 @@ def test_issue6207(en_tokenizer): assert s3 in result +@pytest.mark.issue(6258) def test_issue6258(): """Test that the non-empty constraint pattern field is respected""" # These one is valid diff --git a/spacy/tests/regression/test_issue6501-7000.py b/spacy/tests/regression/test_issue6501-7000.py index f57e4085c..84517d79b 100644 --- a/spacy/tests/regression/test_issue6501-7000.py +++ b/spacy/tests/regression/test_issue6501-7000.py @@ -13,6 +13,7 @@ import pickle from ..util import make_tempdir +@pytest.mark.issue(6730) def test_issue6730(en_vocab): """Ensure that the KB does not accept empty strings, but otherwise IO works fine.""" from spacy.kb import KnowledgeBase @@ -34,6 +35,7 @@ def test_issue6730(en_vocab): assert set(kb.get_alias_strings()) == {"x", "y"} +@pytest.mark.issue(6755) def test_issue6755(en_tokenizer): doc = en_tokenizer("This is a magnificent sentence.") span = doc[:0] @@ -45,6 +47,7 @@ def test_issue6755(en_tokenizer): "sentence, start_idx,end_idx,label", [("Welcome to Mumbai, my friend", 11, 17, "GPE")], ) +@pytest.mark.issue(6815) def test_issue6815_1(sentence, start_idx, end_idx, label): nlp = English() doc = nlp(sentence) @@ -55,6 +58,7 @@ def test_issue6815_1(sentence, start_idx, end_idx, label): @pytest.mark.parametrize( "sentence, start_idx,end_idx,kb_id", [("Welcome to Mumbai, my friend", 11, 17, 5)] ) +@pytest.mark.issue(6815) def test_issue6815_2(sentence, start_idx, end_idx, kb_id): nlp = English() doc = nlp(sentence) @@ -66,6 +70,7 @@ def test_issue6815_2(sentence, start_idx, end_idx, kb_id): "sentence, start_idx,end_idx,vector", [("Welcome to Mumbai, my friend", 11, 17, np.array([0.1, 0.2, 0.3]))], ) +@pytest.mark.issue(6815) def test_issue6815_3(sentence, start_idx, end_idx, vector): nlp = English() doc = nlp(sentence) @@ -73,6 +78,7 @@ def test_issue6815_3(sentence, start_idx, end_idx, vector): assert (span.vector == vector).all() +@pytest.mark.issue(6839) def test_issue6839(en_vocab): """Ensure that PhraseMatcher accepts Span as input""" # fmt: off @@ -155,6 +161,7 @@ labels = ['label1', 'label2'] "component_name", ["textcat", "textcat_multilabel"], ) +@pytest.mark.issue(6908) def test_issue6908(component_name): """Test intializing textcat with labels in a list""" @@ -219,6 +226,7 @@ upstream = "*" """ +@pytest.mark.issue(6950) def test_issue6950(): """Test that the nlp object with initialized tok2vec with listeners pickles correctly (and doesn't have lambdas). 
diff --git a/spacy/tests/regression/test_issue7001-8000.py b/spacy/tests/regression/test_issue7001-8000.py index 5bb7cc08e..17b8a6839 100644 --- a/spacy/tests/regression/test_issue7001-8000.py +++ b/spacy/tests/regression/test_issue7001-8000.py @@ -13,6 +13,7 @@ from wasabi import msg from ..util import make_tempdir +@pytest.mark.issue(7019) def test_issue7019(): scores = {"LABEL_A": 0.39829102, "LABEL_B": 0.938298329382, "LABEL_C": None} print_textcats_auc_per_cat(msg, scores) @@ -64,6 +65,7 @@ upstream = "*" """ +@pytest.mark.issue(7029) def test_issue7029(): """Test that an empty document doesn't mess up an entire batch.""" TRAIN_DATA = [ @@ -84,6 +86,7 @@ def test_issue7029(): assert [doc[0].tag_ for doc in docs1[:-1]] == [doc[0].tag_ for doc in docs2[:-1]] +@pytest.mark.issue(7055) def test_issue7055(): """Test that fill-config doesn't turn sourced components into factories.""" source_cfg = { @@ -118,6 +121,7 @@ def test_issue7055(): assert "model" in filled_cfg["components"]["ner"] +@pytest.mark.issue(7056) def test_issue7056(): """Test that the Unshift transition works properly, and doesn't cause sentence segmentation errors.""" @@ -190,6 +194,7 @@ def test_partial_links(): assert "ORG" not in results["nel_f_per_type"] +@pytest.mark.issue(7065) def test_issue7065(): text = "Kathleen Battle sang in Mahler 's Symphony No. 8 at the Cincinnati Symphony Orchestra 's May Festival." nlp = English() @@ -217,6 +222,7 @@ def test_issue7065(): assert sentences.index(ent.sent) == 0 +@pytest.mark.issue(7065) def test_issue7065_b(): # Test that the NEL doesn't crash when an entity crosses a sentence boundary nlp = English() diff --git a/spacy/tests/regression/test_issue7716.py b/spacy/tests/regression/test_issue7716.py index 811952792..d9b3967ff 100644 --- a/spacy/tests/regression/test_issue7716.py +++ b/spacy/tests/regression/test_issue7716.py @@ -43,6 +43,7 @@ def parser(vocab): return parser +@pytest.mark.issue(7716) @pytest.mark.xfail(reason="Not fixed yet") def test_partial_annotation(parser): doc = Doc(parser.vocab, words=["a", "b", "c", "d"]) diff --git a/spacy/tests/regression/test_issue8190.py b/spacy/tests/regression/test_issue8190.py index 6ddbe53e0..1168630b6 100644 --- a/spacy/tests/regression/test_issue8190.py +++ b/spacy/tests/regression/test_issue8190.py @@ -3,6 +3,7 @@ from spacy.lang.en import English from ..util import make_tempdir +@pytest.mark.issue(8190) def test_issue8190(): """Test that config overrides are not lost after load is complete.""" source_cfg = { diff --git a/spacy/tests/regression/test_issue8216.py b/spacy/tests/regression/test_issue8216.py index 00cd6da3b..0370074fe 100644 --- a/spacy/tests/regression/test_issue8216.py +++ b/spacy/tests/regression/test_issue8216.py @@ -22,6 +22,7 @@ def patterns(): ] +@pytest.mark.issue(8216) def test_entity_ruler_fix8216(nlp, patterns): """Test that patterns don't get added excessively.""" ruler = nlp.add_pipe("entity_ruler", config={"validate": True}) From 8e7deaf210988ed87f72144dc7a75f9c27885f41 Mon Sep 17 00:00:00 2001 From: Lj Miranda Date: Fri, 5 Nov 2021 10:49:48 +0800 Subject: [PATCH 062/133] Add missing imports in some regression tests - test_issue7001-8000.py - test_issue8190.py --- spacy/tests/regression/test_issue7001-8000.py | 1 + spacy/tests/regression/test_issue8190.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/spacy/tests/regression/test_issue7001-8000.py b/spacy/tests/regression/test_issue7001-8000.py index 17b8a6839..1164e85b9 100644 --- a/spacy/tests/regression/test_issue7001-8000.py +++ 
b/spacy/tests/regression/test_issue7001-8000.py @@ -1,3 +1,4 @@ +import pytest from spacy.cli.evaluate import print_textcats_auc_per_cat, print_prf_per_type from spacy.lang.en import English from spacy.training import Example diff --git a/spacy/tests/regression/test_issue8190.py b/spacy/tests/regression/test_issue8190.py index 1168630b6..0b2f2824b 100644 --- a/spacy/tests/regression/test_issue8190.py +++ b/spacy/tests/regression/test_issue8190.py @@ -1,3 +1,5 @@ +import pytest + import spacy from spacy.lang.en import English from ..util import make_tempdir From e6f91b6f276e90c32253d21ada29cc20108d1170 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 5 Nov 2021 09:56:26 +0100 Subject: [PATCH 063/133] Format (#9630) --- spacy/lang/ja/__init__.py | 6 +++++- spacy/lang/ru/lemmatizer.py | 4 +++- spacy/lang/ti/lex_attrs.py | 4 ++-- spacy/lang/uk/lemmatizer.py | 4 +++- spacy/lang/vi/examples.py | 1 - spacy/scorer.py | 4 +++- spacy/tests/lang/ja/test_tokenizer.py | 6 +----- spacy/tests/tokenizer/test_tokenizer.py | 4 ++-- spacy/tests/training/test_training.py | 3 +++ spacy/training/pretrain.py | 4 +++- spacy/util.py | 17 ++++++++--------- 11 files changed, 33 insertions(+), 24 deletions(-) diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py index 81ff5b5b8..bf86305fb 100644 --- a/spacy/lang/ja/__init__.py +++ b/spacy/lang/ja/__init__.py @@ -203,7 +203,11 @@ class Japanese(Language): "extend": True, "scorer": {"@scorers": "spacy.morphologizer_scorer.v1"}, }, - default_score_weights={"pos_acc": 0.5, "morph_micro_f": 0.5, "morph_per_feat": None}, + default_score_weights={ + "pos_acc": 0.5, + "morph_micro_f": 0.5, + "morph_per_feat": None, + }, ) def make_morphologizer( nlp: Language, diff --git a/spacy/lang/ru/lemmatizer.py b/spacy/lang/ru/lemmatizer.py index 2fc3a471b..85180b1e4 100644 --- a/spacy/lang/ru/lemmatizer.py +++ b/spacy/lang/ru/lemmatizer.py @@ -33,7 +33,9 @@ class RussianLemmatizer(Lemmatizer): ) from None if getattr(self, "_morph", None) is None: self._morph = MorphAnalyzer() - super().__init__(vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer) + super().__init__( + vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer + ) def pymorphy2_lemmatize(self, token: Token) -> List[str]: string = token.text diff --git a/spacy/lang/ti/lex_attrs.py b/spacy/lang/ti/lex_attrs.py index b29bd8c96..da56af6c0 100644 --- a/spacy/lang/ti/lex_attrs.py +++ b/spacy/lang/ti/lex_attrs.py @@ -27,7 +27,7 @@ _num_words = [ "ትሪልዮን", "ኳድሪልዮን", "ጋዚልዮን", - "ባዚልዮን" + "ባዚልዮን", ] # Tigrinya ordinals above 10 are the same as _num_words but start with "መበል " @@ -41,7 +41,7 @@ _ordinal_words = [ "ሻውዓይ", "ሻምናይ", "ታሽዓይ", - "ዓስራይ" + "ዓስራይ", ] diff --git a/spacy/lang/uk/lemmatizer.py b/spacy/lang/uk/lemmatizer.py index fd566a3a8..a8bc56057 100644 --- a/spacy/lang/uk/lemmatizer.py +++ b/spacy/lang/uk/lemmatizer.py @@ -29,4 +29,6 @@ class UkrainianLemmatizer(RussianLemmatizer): ) from None if getattr(self, "_morph", None) is None: self._morph = MorphAnalyzer(lang="uk") - super().__init__(vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer) + super().__init__( + vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer + ) diff --git a/spacy/lang/vi/examples.py b/spacy/lang/vi/examples.py index 86d0b50b8..36575f67c 100644 --- a/spacy/lang/vi/examples.py +++ b/spacy/lang/vi/examples.py @@ -1,4 +1,3 @@ - """ Example sentences to test spaCy and its language models. 
>>> from spacy.lang.vi.examples import sentences diff --git a/spacy/scorer.py b/spacy/scorer.py index 75e5b3317..cfdf34e62 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -303,7 +303,9 @@ class Scorer: pred_per_feat[field] = set() pred_per_feat[field].add((gold_i, feat)) for field in per_feat: - micro_score.score_set(pred_per_feat.get(field, set()), gold_per_feat.get(field, set())) + micro_score.score_set( + pred_per_feat.get(field, set()), gold_per_feat.get(field, set()) + ) per_feat[field].score_set( pred_per_feat.get(field, set()), gold_per_feat.get(field, set()) ) diff --git a/spacy/tests/lang/ja/test_tokenizer.py b/spacy/tests/lang/ja/test_tokenizer.py index 098884cf0..3437ea283 100644 --- a/spacy/tests/lang/ja/test_tokenizer.py +++ b/spacy/tests/lang/ja/test_tokenizer.py @@ -133,11 +133,7 @@ def test_ja_tokenizer_sub_tokens( (["五段-ラ行;連用形-促音便"], [], ["下一段-カ行;連用形-一般"], ["助動詞-タ;終止形-一般"]), (["トッ"], ["テ"], ["ツケ"], ["タ"]), ), - ( - "2=3", - ([], [], []), - (["ニ"], ["_"], ["サン"]) - ), + ("2=3", ([], [], []), (["ニ"], ["_"], ["サン"])), ], ) def test_ja_tokenizer_inflections_reading_forms( diff --git a/spacy/tests/tokenizer/test_tokenizer.py b/spacy/tests/tokenizer/test_tokenizer.py index 192faa67b..452bcc079 100644 --- a/spacy/tests/tokenizer/test_tokenizer.py +++ b/spacy/tests/tokenizer/test_tokenizer.py @@ -216,8 +216,8 @@ def test_tokenizer_flush_specials(en_vocab): def test_tokenizer_prefix_suffix_overlap_lookbehind(en_vocab): # the prefix and suffix matches overlap in the suffix lookbehind - prefixes = ['a(?=.)'] - suffixes = [r'(?<=\w)\.', r'(?<=a)\d+\.'] + prefixes = ["a(?=.)"] + suffixes = [r"(?<=\w)\.", r"(?<=a)\d+\."] prefix_re = compile_prefix_regex(prefixes) suffix_re = compile_suffix_regex(suffixes) tokenizer = Tokenizer( diff --git a/spacy/tests/training/test_training.py b/spacy/tests/training/test_training.py index 48636a4eb..68f86190b 100644 --- a/spacy/tests/training/test_training.py +++ b/spacy/tests/training/test_training.py @@ -524,6 +524,7 @@ def test_roundtrip_docs_to_docbin(doc): assert cats["TRAVEL"] == reloaded_example.reference.cats["TRAVEL"] assert cats["BAKING"] == reloaded_example.reference.cats["BAKING"] + def test_docbin_user_data_serialized(doc): doc.user_data["check"] = True nlp = English() @@ -536,6 +537,7 @@ def test_docbin_user_data_serialized(doc): assert reloaded_doc.user_data["check"] == True + def test_docbin_user_data_not_serialized(doc): # this isn't serializable, but that shouldn't cause an error doc.user_data["check"] = set() @@ -549,6 +551,7 @@ def test_docbin_user_data_not_serialized(doc): assert "check" not in reloaded_doc.user_data + @pytest.mark.parametrize( "tokens_a,tokens_b,expected", [ diff --git a/spacy/training/pretrain.py b/spacy/training/pretrain.py index 7830196bc..465406a49 100644 --- a/spacy/training/pretrain.py +++ b/spacy/training/pretrain.py @@ -50,7 +50,9 @@ def pretrain( # TODO: move this to logger function? 
tracker = ProgressTracker(frequency=10000) if P["n_save_epoch"]: - msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_resume} - saving every {P['n_save_epoch']} epoch") + msg.divider( + f"Pre-training tok2vec layer - starting at epoch {epoch_resume} - saving every {P['n_save_epoch']} epoch" + ) else: msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_resume}") row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")} diff --git a/spacy/util.py b/spacy/util.py index e14f6030f..4424f6897 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -288,16 +288,17 @@ def find_matching_language(lang: str) -> Optional[str]: None """ import spacy.lang # noqa: F401 - if lang == 'xx': - return 'xx' + + if lang == "xx": + return "xx" # Find out which language modules we have possible_languages = [] for modinfo in pkgutil.iter_modules(spacy.lang.__path__): # type: ignore code = modinfo.name - if code == 'xx': + if code == "xx": # Temporarily make 'xx' into a valid language code - possible_languages.append('mul') + possible_languages.append("mul") elif langcodes.tag_is_valid(code): possible_languages.append(code) @@ -306,12 +307,10 @@ def find_matching_language(lang: str) -> Optional[str]: # more possibilities, like variants of Chinese like 'wuu', but text that # is labeled that way is probably trying to be distinct from 'zh' and # shouldn't automatically match. - match = langcodes.closest_supported_match( - lang, possible_languages, max_distance=9 - ) - if match == 'mul': + match = langcodes.closest_supported_match(lang, possible_languages, max_distance=9) + if match == "mul": # Convert 'mul' back to spaCy's 'xx' - return 'xx' + return "xx" else: return match From 5cdb7eb5c28a6d37889fc355b540e1665dcd5d9b Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 5 Nov 2021 09:58:36 +0100 Subject: [PATCH 064/133] Auto-format code with black (#9631) Co-authored-by: explosion-bot Co-authored-by: Adriane Boyd --- spacy/tokens/doc.pyi | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/spacy/tokens/doc.pyi b/spacy/tokens/doc.pyi index 46a10df03..f540002c9 100644 --- a/spacy/tokens/doc.pyi +++ b/spacy/tokens/doc.pyi @@ -139,8 +139,12 @@ class Doc: def count_by( self, attr_id: int, exclude: Optional[Any] = ..., counts: Optional[Any] = ... ) -> Dict[Any, int]: ... - def from_array(self, attrs: Union[int, str, List[Union[int, str]]], array: Ints2d) -> Doc: ... - def to_array(self, py_attr_ids: Union[int, str, List[Union[int, str]]]) -> numpy.ndarray: ... + def from_array( + self, attrs: Union[int, str, List[Union[int, str]]], array: Ints2d + ) -> Doc: ... + def to_array( + self, py_attr_ids: Union[int, str, List[Union[int, str]]] + ) -> numpy.ndarray: ... 
@staticmethod def from_docs( docs: List[Doc], From 216ed231a988640841a7e6c7d936a9b00fd9ed1a Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 5 Nov 2021 16:31:14 +0100 Subject: [PATCH 065/133] What's new in v3.2 (#9633) * What's new in v3.2 * Fix formatting * Fix typo * Redo thanks * Formatting * Fix typo * Fix project links * Fix typo * Minimal intro, floret python module * Rephrase * Rephrase, extend * Rephrase * Update links and formatting [ci skip] * Minor correction * Fix typo Co-authored-by: Ines Montani --- website/docs/usage/v3-2.md | 244 +++++++++++++++++++++++++++++++++ website/meta/sidebars.json | 3 +- website/src/templates/index.js | 4 +- 3 files changed, 248 insertions(+), 3 deletions(-) create mode 100644 website/docs/usage/v3-2.md diff --git a/website/docs/usage/v3-2.md b/website/docs/usage/v3-2.md new file mode 100644 index 000000000..766d1c0a9 --- /dev/null +++ b/website/docs/usage/v3-2.md @@ -0,0 +1,244 @@ +--- +title: What's New in v3.2 +teaser: New features and how to upgrade +menu: + - ['New Features', 'features'] + - ['Upgrading Notes', 'upgrading'] +--- + +## New Features {#features hidden="true"} + +spaCy v3.2 adds support for [`floret`](https://github.com/explosion/floret) +vectors, makes custom `Doc` creation and scoring easier, and includes many bug +fixes and improvements. For the trained pipelines, there's a new transformer +pipeline for Japanese and the Universal Dependencies training data has been +updated across the board to the most recent release. + + + +spaCy is now up to **8 × faster on M1 Macs** by calling into Apple's +native Accelerate library for matrix multiplication. For more details, see +[`thinc-apple-ops`](https://github.com/explosion/thinc-apple-ops). + +```bash +$ pip install spacy[apple] +``` + + + +### Registered scoring functions {#registered-scoring-functions} + +To customize the scoring, you can specify a scoring function for each component +in your config from the new [`scorers` registry](/api/top-level#registry): + +```ini +### config.cfg (excerpt) {highlight="3"} +[components.tagger] +factory = "tagger" +scorer = {"@scorers":"spacy.tagger_scorer.v1"} +``` + +### Overwrite settings {#overwrite} + +Most pipeline components now include an `overwrite` setting in the config that +determines whether existing annotation in the `Doc` is preserved or overwritten: + +```ini +### config.cfg (excerpt) {highlight="3"} +[components.tagger] +factory = "tagger" +overwrite = false +``` + +### Doc input for pipelines {#doc-input} + +[`nlp`](/api/language#call) and [`nlp.pipe`](/api/language#pipe) accept +[`Doc`](/api/doc) input, skipping the tokenizer if a `Doc` is provided instead +of a string. This makes it easier to create a `Doc` with custom tokenization or +to set custom extensions before processing: + +```python +doc = nlp.make_doc("This is text 500.") +doc._.text_id = 500 +doc = nlp(doc) +``` + +### Support for floret vectors {#vectors} + +We recently published [`floret`](https://github.com/explosion/floret), an +extended version of [fastText](https://fasttext.cc) that combines fastText's +subwords with Bloom embeddings for compact, full-coverage vectors. The use of +subwords means that there are no OOV words and due to Bloom embeddings, the +vector table can be kept very small at <100K entries. Bloom embeddings are +already used by [HashEmbed](https://thinc.ai/docs/api-layers#hashembed) in +[tok2vec](/api/architectures#tok2vec-arch) for compact spaCy models. 
+
+For easy integration, floret includes a
+[Python wrapper](https://github.com/explosion/floret/blob/main/python/README.md):
+
+```bash
+$ pip install floret
+```
+
+A demo project shows how to train and import floret vectors:
+
+
+Train toy English floret vectors and import them into a spaCy pipeline.
+
+
+Two additional demo projects compare standard fastText vectors with floret
+vectors for full spaCy pipelines. For agglutinative languages like Finnish or
+Korean, there are large improvements in performance due to the use of subwords
+(no OOV words!), with a vector table containing merely 50K entries.
+
+
+
+Finnish UD+NER vector and pipeline training, comparing standard fastText vs.
+floret vectors.
+
+For the default project settings with 1M (2.6G) tokenized training texts and 50K
+300-dim vectors, ~300K keys for the standard vectors:
+
+| Vectors | TAG | POS | DEP UAS | DEP LAS | NER F |
+| -------------------------------------------- | -------: | -------: | -------: | -------: | -------: |
+| none | 93.3 | 92.3 | 79.7 | 72.8 | 61.0 |
+| standard (pruned: 50K vectors for 300K keys) | 95.9 | 94.7 | 83.3 | 77.9 | 68.5 |
+| standard (unpruned: 300K vectors/keys) | 96.0 | 95.0 | **83.8** | 78.4 | 69.1 |
+| floret (minn 4, maxn 5; 50K vectors, no OOV) | **96.6** | **95.5** | 83.5 | **78.5** | **70.9** |
+
+
+
+
+Korean UD vector and pipeline training, comparing standard fastText vs. floret
+vectors.
+
+For the default project settings with 1M (3.3G) tokenized training texts and 50K
+300-dim vectors, ~800K keys for the standard vectors:
+
+| Vectors | TAG | POS | DEP UAS | DEP LAS |
+| -------------------------------------------- | -------: | -------: | -------: | -------: |
+| none | 72.5 | 85.0 | 73.2 | 64.3 |
+| standard (pruned: 50K vectors for 800K keys) | 77.9 | 89.4 | 78.8 | 72.8 |
+| standard (unpruned: 800K vectors/keys) | 79.0 | 90.2 | 79.2 | 73.9 |
+| floret (minn 2, maxn 3; 50K vectors, no OOV) | **82.5** | **93.8** | **83.0** | **80.1** |
+
+
+
+### Updates for spacy-transformers v1.1 {#spacy-transformers}
+
+[`spacy-transformers`](https://github.com/explosion/spacy-transformers) v1.1 has
+been refactored to improve serialization and to support inline transformer
+components and replacing listeners. In addition, the transformer model output is
+provided as
+[`ModelOutput`](https://huggingface.co/transformers/main_classes/output.html?highlight=modeloutput#transformers.file_utils.ModelOutput)
+instead of tuples in
+`TransformerData.model_output` and `FullTransformerBatch.model_output`. For
+backwards compatibility, the tuple format remains available under
+`TransformerData.tensors` and `FullTransformerBatch.tensors`. See more details
+in the [transformer API docs](/api/architectures#TransformerModel).
+
+`spacy-transformers` v1.1 also adds support for `transformer_config` settings
+such as `output_attentions`. Additional output is stored under
+`TransformerData.model_output`. More details are in the
+[TransformerModel docs](/api/architectures#TransformerModel). The training speed
+has been improved by streamlining allocations for tokenizer output, and there is
+new support for [mixed-precision training](/api/architectures#TransformerModel).
+
+### New transformer package for Japanese {#pipeline-packages}
+
+spaCy v3.2 adds a new transformer pipeline package for Japanese
+[`ja_core_news_trf`](/models/ja#ja_core_news_trf), which uses the `basic`
+pretokenizer instead of `mecab` to limit the number of dependencies required for
+the pipeline. 
Thanks to Hiroshi Matsuda and the spaCy Japanese community for +their contributions! + +### Pipeline and language updates {#pipeline-updates} + +- All Universal Dependencies training data has been updated to v2.8. +- The Catalan data, tokenizer and lemmatizer have been updated, thanks to Carlos + Rodriguez and the Barcelona Supercomputing Center! +- The transformer pipelines are trained using spacy-transformers v1.1, with + improved IO and more options for + [model config and output](/api/architectures#TransformerModel). +- Trailing whitespace has been added as a `tok2vec` feature, improving the + performance for many components, especially fine-grained tagging and sentence + segmentation. +- The English attribute ruler patterns have been overhauled to improve + `Token.pos` and `Token.morph`. + +spaCy v3.2 also features a new Irish lemmatizer, support for `noun_chunks` in +Portuguese, improved `noun_chunks` for Spanish and additional updates for +Bulgarian, Catalan, Sinhala, Tagalog, Tigrinya and Vietnamese. + +## Notes about upgrading from v3.1 {#upgrading} + +### Pipeline package version compatibility {#version-compat} + +> #### Using legacy implementations +> +> In spaCy v3, you'll still be able to load and reference legacy implementations +> via [`spacy-legacy`](https://github.com/explosion/spacy-legacy), even if the +> components or architectures change and newer versions are available in the +> core library. + +When you're loading a pipeline package trained with spaCy v3.0 or v3.1, you will +see a warning telling you that the pipeline may be incompatible. This doesn't +necessarily have to be true, but we recommend running your pipelines against +your test suite or evaluation data to make sure there are no unexpected results. +If you're using one of the [trained pipelines](/models) we provide, you should +run [`spacy download`](/api/cli#download) to update to the latest version. To +see an overview of all installed packages and their compatibility, you can run +[`spacy validate`](/api/cli#validate). + +If you've trained your own custom pipeline and you've confirmed that it's still +working as expected, you can update the spaCy version requirements in the +[`meta.json`](/api/data-formats#meta): + +```diff +- "spacy_version": ">=3.1.0,<3.2.0", ++ "spacy_version": ">=3.2.0,<3.3.0", +``` + +### Updating v3.1 configs + +To update a config from spaCy v3.1 with the new v3.2 settings, run +[`init fill-config`](/api/cli#init-fill-config): + +```cli +$ python -m spacy init fill-config config-v3.1.cfg config-v3.2.cfg +``` + +In many cases ([`spacy train`](/api/cli#train), +[`spacy.load`](/api/top-level#spacy.load)), the new defaults will be filled in +automatically, but you'll need to fill in the new settings to run +[`debug config`](/api/cli#debug) and [`debug data`](/api/cli#debug-data). + +## Notes about upgrading from spacy-transformers v1.0 {#upgrading-transformers} + +When you're loading a transformer pipeline package trained with +[`spacy-transformers`](https://github.com/explosion/spacy-transformers) v1.0 +after upgrading to `spacy-transformers` v1.1, you'll see a warning telling you +that the pipeline may be incompatible. `spacy-transformers` v1.1 should be able +to import v1.0 `transformer` components into the new internal format with no +change in performance, but here we'd also recommend running your test suite to +verify that the pipeline still performs as expected. 
+ +If you save your pipeline with [`nlp.to_disk`](/api/language#to_disk), it will +be saved in the new v1.1 format and should be fully compatible with +`spacy-transformers` v1.1. Once you've confirmed the performance, you can update +the requirements in [`meta.json`](/api/data-formats#meta): + +```diff + "requirements": [ +- "spacy-transformers>=1.0.3,<1.1.0" ++ "spacy-transformers>=1.1.2,<1.2.0" + ] +``` + +If you're using one of the [trained pipelines](/models) we provide, you should +run [`spacy download`](/api/cli#download) to update to the latest version. To +see an overview of all installed packages and their compatibility, you can run +[`spacy validate`](/api/cli#validate). diff --git a/website/meta/sidebars.json b/website/meta/sidebars.json index 6fe09f052..1054f7626 100644 --- a/website/meta/sidebars.json +++ b/website/meta/sidebars.json @@ -10,7 +10,8 @@ { "text": "Facts & Figures", "url": "/usage/facts-figures" }, { "text": "spaCy 101", "url": "/usage/spacy-101" }, { "text": "New in v3.0", "url": "/usage/v3" }, - { "text": "New in v3.1", "url": "/usage/v3-1" } + { "text": "New in v3.1", "url": "/usage/v3-1" }, + { "text": "New in v3.2", "url": "/usage/v3-2" } ] }, { diff --git a/website/src/templates/index.js b/website/src/templates/index.js index 2c68ff056..56ac0dbed 100644 --- a/website/src/templates/index.js +++ b/website/src/templates/index.js @@ -119,8 +119,8 @@ const AlertSpace = ({ nightly, legacy }) => { } const navAlert = ( - - 💥 Out now: spaCy v3.1 + + 💥 Out now: spaCy v3.2 ) From 86af0234abfd2d3d2a609bd46eaa4b475477eb3b Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 5 Nov 2021 19:02:35 +0100 Subject: [PATCH 066/133] Update version [ci skip] --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 61d5449a4..57d76fb45 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ production-ready [**training system**](https://spacy.io/usage/training) and easy model packaging, deployment and workflow management. spaCy is commercial open-source software, released under the MIT license. -💫 **Version 3.0 out now!** +💫 **Version 3.2 out now!** [Check out the release notes here.](https://github.com/explosion/spaCy/releases) [![Azure Pipelines](https://img.shields.io/azure-devops/build/explosion-ai/public/8/master.svg?logo=azure-pipelines&style=flat-square&label=build)](https://dev.azure.com/explosion-ai/public/_build?definitionId=8) From 909177589dcdbde1cd4770f9f744d4d57d08d7e0 Mon Sep 17 00:00:00 2001 From: Lj Miranda Date: Sat, 6 Nov 2021 06:35:58 +0800 Subject: [PATCH 067/133] Remove utility script --- spacy/tests/regression/util_add_marker.py | 41 ----------------------- 1 file changed, 41 deletions(-) delete mode 100644 spacy/tests/regression/util_add_marker.py diff --git a/spacy/tests/regression/util_add_marker.py b/spacy/tests/regression/util_add_marker.py deleted file mode 100644 index 94fa415bc..000000000 --- a/spacy/tests/regression/util_add_marker.py +++ /dev/null @@ -1,41 +0,0 @@ -import re -from pathlib import Path -from typing import Optional - -import typer - - -def main( - filename: Path, out_file: Optional[Path] = typer.Option(None), dry_run: bool = False -): - """Add pytest issue markers on regression tests - - If --out-file is not used, it will overwrite the original file. You can set - the --dry-run flag to just see the changeset and not write to disk. - """ - lines = [] - with filename.open() as f: - lines = f.readlines() - - # Regex pattern for matching common regression formats (e.g. 
test_issue1234) - pattern = r"def test_issue\d{1,4}" - regex = re.compile(pattern) - - new_lines = [] - for line_text in lines: - if regex.search(line_text): # if match, append marker first - issue_num = int(re.findall(r"\d+", line_text)[0]) # Simple heuristic - typer.echo(f"Found: {line_text} with issue number: {issue_num}") - new_lines.append(f"@pytest.mark.issue({issue_num})\n") - new_lines.append(line_text) - - # Save to file - if not dry_run: - out = out_file or filename - with out.open("w") as f: - for new_line in new_lines: - f.write(new_line) - - -if __name__ == "__main__": - typer.run(main) From 141f12b92e8a3ace4e3ba019f35ee2ff2fd8e7e3 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Sun, 7 Nov 2021 18:56:23 +0900 Subject: [PATCH 068/133] Make Jsonl Corpus reader optional again --- spacy/training/corpus.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/training/corpus.py b/spacy/training/corpus.py index b30d918fd..733fc267e 100644 --- a/spacy/training/corpus.py +++ b/spacy/training/corpus.py @@ -41,7 +41,7 @@ def create_docbin_reader( @util.registry.readers("spacy.JsonlCorpus.v1") def create_jsonl_reader( - path: Union[str, Path], min_length: int = 0, max_length: int = 0, limit: int = 0 + path: Union[None, str, Path], min_length: int = 0, max_length: int = 0, limit: int = 0 ) -> Callable[["Language"], Iterable[Example]]: return JsonlCorpus(path, min_length=min_length, max_length=max_length, limit=limit) From 71fb00ed954fd758cc743d80dc52e77f3e9bc689 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Mon, 8 Nov 2021 10:02:29 +0000 Subject: [PATCH 069/133] Update spacy/training/corpus.py Co-authored-by: Sofie Van Landeghem --- spacy/training/corpus.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/training/corpus.py b/spacy/training/corpus.py index 733fc267e..aab2eae94 100644 --- a/spacy/training/corpus.py +++ b/spacy/training/corpus.py @@ -41,7 +41,7 @@ def create_docbin_reader( @util.registry.readers("spacy.JsonlCorpus.v1") def create_jsonl_reader( - path: Union[None, str, Path], min_length: int = 0, max_length: int = 0, limit: int = 0 + path: Optional[Union[str, Path]], min_length: int = 0, max_length: int = 0, limit: int = 0 ) -> Callable[["Language"], Iterable[Example]]: return JsonlCorpus(path, min_length=min_length, max_length=max_length, limit=limit) From 8aa2d32ca92d3501695cdb057bcc7d479a0ab1df Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Mon, 8 Nov 2021 19:03:47 +0900 Subject: [PATCH 070/133] Update jsonlcorpus constructor types --- spacy/training/corpus.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/training/corpus.py b/spacy/training/corpus.py index aab2eae94..cf643feec 100644 --- a/spacy/training/corpus.py +++ b/spacy/training/corpus.py @@ -221,7 +221,7 @@ class JsonlCorpus: def __init__( self, - path: Union[str, Path], + path: Optional[Union[str, Path]], *, limit: int = 0, min_length: int = 0, From 67d8c8a081fa87763184e5fc794dc8542e3eb63e Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 12 Nov 2021 10:00:03 +0100 Subject: [PATCH 071/133] Auto-format code with black (#9664) Co-authored-by: explosion-bot --- spacy/training/corpus.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/spacy/training/corpus.py b/spacy/training/corpus.py index cf643feec..b9f929fcd 100644 --- a/spacy/training/corpus.py +++ b/spacy/training/corpus.py @@ -41,7 +41,10 @@ def create_docbin_reader( 
@util.registry.readers("spacy.JsonlCorpus.v1") def create_jsonl_reader( - path: Optional[Union[str, Path]], min_length: int = 0, max_length: int = 0, limit: int = 0 + path: Optional[Union[str, Path]], + min_length: int = 0, + max_length: int = 0, + limit: int = 0, ) -> Callable[["Language"], Iterable[Example]]: return JsonlCorpus(path, min_length=min_length, max_length=max_length, limit=limit) From c9baf9d196cba07fe1b1c636bcab3c80c6b81b44 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 15 Nov 2021 12:40:55 +0100 Subject: [PATCH 072/133] Fix spancat for empty docs and zero suggestions (#9654) * Fix spancat for empty docs and zero suggestions * Use ops.xp.zeros in test --- spacy/ml/extract_spans.py | 10 +++++++-- spacy/pipeline/spancat.py | 2 +- spacy/tests/pipeline/test_spancat.py | 31 +++++++++++++++++++++++++++- 3 files changed, 39 insertions(+), 4 deletions(-) diff --git a/spacy/ml/extract_spans.py b/spacy/ml/extract_spans.py index 9bc972032..edc86ff9c 100644 --- a/spacy/ml/extract_spans.py +++ b/spacy/ml/extract_spans.py @@ -28,7 +28,13 @@ def forward( X, spans = source_spans assert spans.dataXd.ndim == 2 indices = _get_span_indices(ops, spans, X.lengths) - Y = Ragged(X.dataXd[indices], spans.dataXd[:, 1] - spans.dataXd[:, 0]) # type: ignore[arg-type, index] + if len(indices) > 0: + Y = Ragged(X.dataXd[indices], spans.dataXd[:, 1] - spans.dataXd[:, 0]) # type: ignore[arg-type, index] + else: + Y = Ragged( + ops.xp.zeros(X.dataXd.shape, dtype=X.dataXd.dtype), + ops.xp.zeros((len(X.lengths),), dtype="i"), + ) x_shape = X.dataXd.shape x_lengths = X.lengths @@ -53,7 +59,7 @@ def _get_span_indices(ops, spans: Ragged, lengths: Ints1d) -> Ints1d: for j in range(spans_i.shape[0]): indices.append(ops.xp.arange(spans_i[j, 0], spans_i[j, 1])) # type: ignore[call-overload, index] offset += length - return ops.flatten(indices) + return ops.flatten(indices, dtype="i", ndim_if_empty=1) def _ensure_cpu(spans: Ragged, lengths: Ints1d) -> Tuple[Ragged, Ints1d]: diff --git a/spacy/pipeline/spancat.py b/spacy/pipeline/spancat.py index 5b84ce8fb..829def1eb 100644 --- a/spacy/pipeline/spancat.py +++ b/spacy/pipeline/spancat.py @@ -78,7 +78,7 @@ def build_ngram_suggester(sizes: List[int]) -> Suggester: if len(spans) > 0: output = Ragged(ops.xp.vstack(spans), lengths_array) else: - output = Ragged(ops.xp.zeros((0, 0)), lengths_array) + output = Ragged(ops.xp.zeros((0, 0), dtype="i"), lengths_array) assert output.dataXd.ndim == 2 return output diff --git a/spacy/tests/pipeline/test_spancat.py b/spacy/tests/pipeline/test_spancat.py index 5c3a9d27d..2f7e952d3 100644 --- a/spacy/tests/pipeline/test_spancat.py +++ b/spacy/tests/pipeline/test_spancat.py @@ -1,7 +1,7 @@ import pytest import numpy from numpy.testing import assert_array_equal, assert_almost_equal -from thinc.api import get_current_ops +from thinc.api import get_current_ops, Ragged from spacy import util from spacy.lang.en import English @@ -29,6 +29,7 @@ TRAIN_DATA_OVERLAPPING = [ "I like London and Berlin", {"spans": {SPAN_KEY: [(7, 13, "LOC"), (18, 24, "LOC"), (7, 24, "DOUBLE_LOC")]}}, ), + ("", {"spans": {SPAN_KEY: []}}), ] @@ -365,3 +366,31 @@ def test_overfitting_IO_overlapping(): "London and Berlin", } assert set([span.label_ for span in spans2]) == {"LOC", "DOUBLE_LOC"} + + +def test_zero_suggestions(): + # Test with a suggester that returns 0 suggestions + + @registry.misc("test_zero_suggester") + def make_zero_suggester(): + def zero_suggester(docs, *, ops=None): + if ops is None: + ops = get_current_ops() + return Ragged( + 
ops.xp.zeros((0, 0), dtype="i"), ops.xp.zeros((len(docs),), dtype="i") + ) + + return zero_suggester + + fix_random_seed(0) + nlp = English() + spancat = nlp.add_pipe( + "spancat", + config={"suggester": {"@misc": "test_zero_suggester"}, "spans_key": SPAN_KEY}, + ) + train_examples = make_examples(nlp) + optimizer = nlp.initialize(get_examples=lambda: train_examples) + assert spancat.model.get_dim("nO") == 2 + assert set(spancat.labels) == {"LOC", "PERSON"} + + nlp.update(train_examples, sgd=optimizer) From 86fa37e8baf631348ec712a174c19c3ca7fb88cd Mon Sep 17 00:00:00 2001 From: Vishnu Nandakumar <38393302+Vishnunkumar@users.noreply.github.com> Date: Tue, 16 Nov 2021 10:36:19 +0530 Subject: [PATCH 073/133] Update universe.json with new library eng_spacysentiment (#9679) * Update universe.json * Update universe.json * Cleanup fields Co-authored-by: Paul O'Leary McCann --- website/meta/universe.json | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/website/meta/universe.json b/website/meta/universe.json index 9b7484a13..7f3813a95 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -3592,6 +3592,32 @@ "github": "xxyzz" }, "category": ["standalone"] + }, + { + "id": "eng_spacysentiment", + "title": "eng_spacysentiment", + "slogan": "Simple sentiment analysis using spaCy pipelines", + "description": "Sentiment analysis for simple english sentences using pre-trained spaCy pipelines", + "github": "vishnunkumar/spacysentiment", + "pip": "eng-spacysentiment", + "code_example": [ + "import eng_spacysentiment", + "nlp = eng_spacysentiment.load()", + "text = \"Welcome to Arsenals official YouTube channel Watch as we take you closer and show you the personality of the club\"", + "doc = nlp(text)", + "print(doc.cats)", + "# {'positive': 0.29878824949264526, 'negative': 0.7012117505073547}" + ], + "thumb": "", + "image": "", + "code_language": "python", + "author": "Vishnu Nandakumar", + "author_links": { + "github": "Vishnunkumar", + "twitter": "vishnun_uchiha" + }, + "category": ["pipeline"], + "tags": ["pipeline", "nlp", "sentiment"] } ], From f3981bd0c87b5f686593e51a53825b2c718eac6e Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Thu, 18 Nov 2021 14:38:30 +0000 Subject: [PATCH 074/133] Clarify how to fill in init_tok2vec after pretraining (#9639) * Clarify how to fill in init_tok2vec after pretraining * Ignore init_tok2vec arg in pretraining * Update docs, config setting * Remove obsolete note about not filling init_tok2vec early This seems to have also caught some lines that needed cleanup. 
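For illustration, a minimal sketch of the workflow this patch documents (not part of the diff below): once `spacy pretrain` has written its weights files, training only needs `initialize.init_tok2vec` to point at the one you want to reuse. The file name `pretrain/model4.bin` comes from the docs updated in this patch; `config.cfg` and routing the value through the `paths.init_tok2vec` variable follow the default config layout and are assumptions here.

```python
# Sketch: confirm that initialize.init_tok2vec resolves to the pretraining
# output before running `spacy train`.
from spacy import util

config = util.load_config(
    "config.cfg",
    overrides={"paths.init_tok2vec": "pretrain/model4.bin"},
    interpolate=True,
)
# With the default layout, initialize.init_tok2vec = ${paths.init_tok2vec},
# so the override resolves through to the initialize block.
assert config["initialize"]["init_tok2vec"] == "pretrain/model4.bin"
```

During pretraining itself this setting is now ignored, since that run is the one creating the file.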
--- spacy/training/pretrain.py | 2 ++ website/docs/api/data-formats.md | 2 +- website/docs/usage/embeddings-transformers.md | 35 +++++++++---------- 3 files changed, 19 insertions(+), 20 deletions(-) diff --git a/spacy/training/pretrain.py b/spacy/training/pretrain.py index 465406a49..52af84aaf 100644 --- a/spacy/training/pretrain.py +++ b/spacy/training/pretrain.py @@ -31,6 +31,8 @@ def pretrain( allocator = config["training"]["gpu_allocator"] if use_gpu >= 0 and allocator: set_gpu_allocator(allocator) + # ignore in pretraining because we're creating it now + config["initialize"]["init_tok2vec"] = None nlp = load_model_from_config(config) _config = nlp.config.interpolate() P = registry.resolve(_config["pretraining"], schema=ConfigSchemaPretrain) diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md index 001455f33..c6cd92799 100644 --- a/website/docs/api/data-formats.md +++ b/website/docs/api/data-formats.md @@ -248,7 +248,7 @@ Also see the usage guides on the | `after_init` | Optional callback to modify the `nlp` object after initialization. ~~Optional[Callable[[Language], Language]]~~ | | `before_init` | Optional callback to modify the `nlp` object before initialization. ~~Optional[Callable[[Language], Language]]~~ | | `components` | Additional arguments passed to the `initialize` method of a pipeline component, keyed by component name. If type annotations are available on the method, the config will be validated against them. The `initialize` methods will always receive the `get_examples` callback and the current `nlp` object. ~~Dict[str, Dict[str, Any]]~~ | -| `init_tok2vec` | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). Defaults to variable `${paths.init_tok2vec}`. ~~Optional[str]~~ | +| `init_tok2vec` | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). Defaults to variable `${paths.init_tok2vec}`. Ignored when actually running pretraining, as you're creating the file to be used later. ~~Optional[str]~~ | | `lookups` | Additional lexeme and vocab data from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). Defaults to `null`. ~~Optional[Lookups]~~ | | `tokenizer` | Additional arguments passed to the `initialize` method of the specified tokenizer. Can be used for languages like Chinese that depend on dictionaries or trained models for tokenization. If type annotations are available on the method, the config will be validated against them. The `initialize` method will always receive the `get_examples` callback and the current `nlp` object. ~~Dict[str, Any]~~ | | `vectors` | Name or path of pipeline containing pretrained word vectors to use, e.g. created with [`init vectors`](/api/cli#init-vectors). Defaults to `null`. ~~Optional[str]~~ | diff --git a/website/docs/usage/embeddings-transformers.md b/website/docs/usage/embeddings-transformers.md index febed6f2f..708cdd8bf 100644 --- a/website/docs/usage/embeddings-transformers.md +++ b/website/docs/usage/embeddings-transformers.md @@ -391,8 +391,8 @@ A wide variety of PyTorch models are supported, but some might not work. If a model doesn't seem to work feel free to open an [issue](https://github.com/explosion/spacy/issues). Additionally note that Transformers loaded in spaCy can only be used for tensors, and pretrained -task-specific heads or text generation features cannot be used as part of -the `transformer` pipeline component. 
+task-specific heads or text generation features cannot be used as part of the +`transformer` pipeline component. @@ -715,8 +715,8 @@ network for a temporary task that forces the model to learn something about sentence structure and word cooccurrence statistics. Pretraining produces a **binary weights file** that can be loaded back in at the -start of training, using the configuration option `initialize.init_tok2vec`. -The weights file specifies an initial set of weights. Training then proceeds as +start of training, using the configuration option `initialize.init_tok2vec`. The +weights file specifies an initial set of weights. Training then proceeds as normal. You can only pretrain one subnetwork from your pipeline at a time, and the @@ -751,15 +751,14 @@ layer = "tok2vec" #### Connecting pretraining to training {#pretraining-training} -To benefit from pretraining, your training step needs to know to initialize -its `tok2vec` component with the weights learned from the pretraining step. -You do this by setting `initialize.init_tok2vec` to the filename of the -`.bin` file that you want to use from pretraining. +To benefit from pretraining, your training step needs to know to initialize its +`tok2vec` component with the weights learned from the pretraining step. You do +this by setting `initialize.init_tok2vec` to the filename of the `.bin` file +that you want to use from pretraining. -A pretraining step that runs for 5 epochs with an output path of `pretrain/`, -as an example, produces `pretrain/model0.bin` through `pretrain/model4.bin`. -To make use of the final output, you could fill in this value in your config -file: +A pretraining step that runs for 5 epochs with an output path of `pretrain/`, as +an example, produces `pretrain/model0.bin` through `pretrain/model4.bin`. To +make use of the final output, you could fill in this value in your config file: ```ini ### config.cfg @@ -773,16 +772,14 @@ init_tok2vec = ${paths.init_tok2vec} -The outputs of `spacy pretrain` are not the same data format as the -pre-packaged static word vectors that would go into -[`initialize.vectors`](/api/data-formats#config-initialize). -The pretraining output consists of the weights that the `tok2vec` -component should start with in an existing pipeline, so it goes in -`initialize.init_tok2vec`. +The outputs of `spacy pretrain` are not the same data format as the pre-packaged +static word vectors that would go into +[`initialize.vectors`](/api/data-formats#config-initialize). The pretraining +output consists of the weights that the `tok2vec` component should start with in +an existing pipeline, so it goes in `initialize.init_tok2vec`. - #### Pretraining objectives {#pretraining-objectives} > ```ini From ea450d652c32f65b947a1e1a498b45f29ed4dc29 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 19 Nov 2021 08:51:19 +0100 Subject: [PATCH 075/133] Exclude strings from v3.2+ source vector checks (#9697) Exclude strings from `Vector.to_bytes()` comparions for v3.2+ `Vectors` that now include the string store so that the source vector comparison is only comparing the vectors and not the strings. 
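To make the check concrete, a small sketch (not part of the diff below; the blank pipelines and the `same_vectors` name are placeholders): excluding `"strings"` compares the vector tables themselves, leaving out the string store that v3.2+ vectors otherwise serialize alongside them.

```python
import spacy

nlp = spacy.blank("en")
source = spacy.blank("en")
# Compare only the vector data, not the strings serialized with v3.2+ vectors.
same_vectors = (
    nlp.vocab.vectors.to_bytes(exclude=["strings"])
    == source.vocab.vectors.to_bytes(exclude=["strings"])
)
print(same_vectors)
```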
--- spacy/language.py | 7 +++++-- spacy/training/initialize.py | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index aa57989ac..204b24ecb 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -701,7 +701,8 @@ class Language: if ( self.vocab.vectors.shape != source.vocab.vectors.shape or self.vocab.vectors.key2row != source.vocab.vectors.key2row - or self.vocab.vectors.to_bytes() != source.vocab.vectors.to_bytes() + or self.vocab.vectors.to_bytes(exclude=["strings"]) + != source.vocab.vectors.to_bytes(exclude=["strings"]) ): warnings.warn(Warnings.W113.format(name=source_name)) if source_name not in source.component_names: @@ -1822,7 +1823,9 @@ class Language: ) if model not in source_nlp_vectors_hashes: source_nlp_vectors_hashes[model] = hash( - source_nlps[model].vocab.vectors.to_bytes() + source_nlps[model].vocab.vectors.to_bytes( + exclude=["strings"] + ) ) if "_sourced_vectors_hashes" not in nlp.meta: nlp.meta["_sourced_vectors_hashes"] = {} diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py index 13ccfeb93..084204389 100644 --- a/spacy/training/initialize.py +++ b/spacy/training/initialize.py @@ -132,7 +132,7 @@ def init_vocab( logger.info(f"Added vectors: {vectors}") # warn if source model vectors are not identical sourced_vectors_hashes = nlp.meta.pop("_sourced_vectors_hashes", {}) - vectors_hash = hash(nlp.vocab.vectors.to_bytes()) + vectors_hash = hash(nlp.vocab.vectors.to_bytes(exclude=["strings"])) for sourced_component, sourced_vectors_hash in sourced_vectors_hashes.items(): if vectors_hash != sourced_vectors_hash: warnings.warn(Warnings.W113.format(name=sourced_component)) From 0e93b315f3a5f96f2190d7eae7f6085bafe9c747 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 19 Nov 2021 08:51:46 +0100 Subject: [PATCH 076/133] Convert labels to strings for README in package CLI (#9694) --- spacy/cli/package.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/cli/package.py b/spacy/cli/package.py index e76343dc3..76e14daf5 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -397,7 +397,7 @@ def _format_label_scheme(data: Dict[str, Any]) -> str: continue col1 = md.bold(md.code(pipe)) col2 = ", ".join( - [md.code(label.replace("|", "\\|")) for label in labels] + [md.code(str(label).replace("|", "\\|")) for label in labels] ) # noqa: W605 label_data.append((col1, col2)) n_labels += len(labels) From 13645dcbf5b2fe567be41d039c4cc4ebdae79ed6 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Mon, 22 Nov 2021 06:43:11 +0100 Subject: [PATCH 077/133] add note that annotating components is new since 3.1 (#9678) --- website/docs/api/data-formats.md | 38 ++++++++++++++++---------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md index c6cd92799..c51a6dbca 100644 --- a/website/docs/api/data-formats.md +++ b/website/docs/api/data-formats.md @@ -181,25 +181,25 @@ single corpus once and then divide it up into `train` and `dev` partitions. This section defines settings and controls for the training and evaluation process that are used when you run [`spacy train`](/api/cli#train). 
-| Name | Description | -| ----------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~ | -| `batcher` | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ | -| `before_to_disk` | Optional callback to modify `nlp` object right before it is saved to disk during and after training. Can be used to remove or reset config values or disable components. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~ | -| `dev_corpus` | Dot notation of the config location defining the dev corpus. Defaults to `corpora.dev`. ~~str~~ | -| `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ | -| `eval_frequency` | How often to evaluate during training (steps). Defaults to `200`. ~~int~~ | -| `frozen_components` | Pipeline component names that are "frozen" and shouldn't be initialized or updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~ | -| `annotating_components` | Pipeline component names that should set annotations on the predicted docs during training. See [here](/usage/training#annotating-components) for details. Defaults to `[]`. ~~List[str]~~ | -| `gpu_allocator` | Library for cupy to route GPU memory allocation to. Can be `"pytorch"` or `"tensorflow"`. Defaults to variable `${system.gpu_allocator}`. ~~str~~ | -| `logger` | Callable that takes the `nlp` and stdout and stderr `IO` objects, sets up the logger, and returns two new callables to log a training step and to finalize the logger. Defaults to [`ConsoleLogger`](/api/top-level#ConsoleLogger). ~~Callable[[Language, IO, IO], [Tuple[Callable[[Dict[str, Any]], None], Callable[[], None]]]]~~ | -| `max_epochs` | Maximum number of epochs to train for. `0` means an unlimited number of epochs. `-1` means that the train corpus should be streamed rather than loaded into memory with no shuffling within the training loop. Defaults to `0`. ~~int~~ | -| `max_steps` | Maximum number of update steps to train for. `0` means an unlimited number of steps. Defaults to `20000`. ~~int~~ | -| `optimizer` | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ | -| `patience` | How many steps to continue without improvement in evaluation score. `0` disables early stopping. Defaults to `1600`. ~~int~~ | -| `score_weights` | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~ | -| `seed` | The random seed. Defaults to variable `${system.seed}`. ~~int~~ | -| `train_corpus` | Dot notation of the config location defining the train corpus. Defaults to `corpora.train`. 
~~str~~ | +| Name | Description | +| ---------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~ | +| `batcher` | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ | +| `before_to_disk` | Optional callback to modify `nlp` object right before it is saved to disk during and after training. Can be used to remove or reset config values or disable components. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~ | +| `dev_corpus` | Dot notation of the config location defining the dev corpus. Defaults to `corpora.dev`. ~~str~~ | +| `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ | +| `eval_frequency` | How often to evaluate during training (steps). Defaults to `200`. ~~int~~ | +| `frozen_components` | Pipeline component names that are "frozen" and shouldn't be initialized or updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~ | +| `annotating_components` 3.1 | Pipeline component names that should set annotations on the predicted docs during training. See [here](/usage/training#annotating-components) for details. Defaults to `[]`. ~~List[str]~~ | +| `gpu_allocator` | Library for cupy to route GPU memory allocation to. Can be `"pytorch"` or `"tensorflow"`. Defaults to variable `${system.gpu_allocator}`. ~~str~~ | +| `logger` | Callable that takes the `nlp` and stdout and stderr `IO` objects, sets up the logger, and returns two new callables to log a training step and to finalize the logger. Defaults to [`ConsoleLogger`](/api/top-level#ConsoleLogger). ~~Callable[[Language, IO, IO], [Tuple[Callable[[Dict[str, Any]], None], Callable[[], None]]]]~~ | +| `max_epochs` | Maximum number of epochs to train for. `0` means an unlimited number of epochs. `-1` means that the train corpus should be streamed rather than loaded into memory with no shuffling within the training loop. Defaults to `0`. ~~int~~ | +| `max_steps` | Maximum number of update steps to train for. `0` means an unlimited number of steps. Defaults to `20000`. ~~int~~ | +| `optimizer` | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ | +| `patience` | How many steps to continue without improvement in evaluation score. `0` disables early stopping. Defaults to `1600`. ~~int~~ | +| `score_weights` | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~ | +| `seed` | The random seed. Defaults to variable `${system.seed}`. ~~int~~ | +| `train_corpus` | Dot notation of the config location defining the train corpus. Defaults to `corpora.train`. 
~~str~~ | ### pretraining {#config-pretraining tag="section,optional"} From 52b8c2d2e0241e1c515131c5e5f576d5dad65059 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Mon, 22 Nov 2021 10:06:07 +0000 Subject: [PATCH 078/133] Add note on batch contract for listeners (#9691) * Add note on batch contract Using listeners requires batches to be consistent. This is obvious if you understand how the listener works, but it wasn't clearly stated in the Docs, and was subtle enough that the EntityLinker missed it. There is probably a clearer way to explain what the actual requirement is, but I figure this is a good start. * Rewrite to clarify role of caching --- website/docs/api/architectures.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md index 01ca4540b..44ba94d9e 100644 --- a/website/docs/api/architectures.md +++ b/website/docs/api/architectures.md @@ -124,6 +124,14 @@ Instead of defining its own `Tok2Vec` instance, a model architecture like [Tagger](/api/architectures#tagger) can define a listener as its `tok2vec` argument that connects to the shared `tok2vec` component in the pipeline. +Listeners work by caching the `Tok2Vec` output for a given batch of `Doc`s. This +means that in order for a component to work with the listener, the batch of +`Doc`s passed to the listener must be the same as the batch of `Doc`s passed to +the `Tok2Vec`. As a result, any manipulation of the `Doc`s which would affect +`Tok2Vec` output, such as to create special contexts or remove `Doc`s for which +no prediction can be made, must happen inside the model, **after** the call to +the `Tok2Vec` component. + | Name | Description | | ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | `width` | The width of the vectors produced by the "upstream" [`Tok2Vec`](/api/tok2vec) component. 
~~int~~ | From 36c70479468b10e1c8578a5a75dec9e908340a6f Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 23 Nov 2021 14:55:55 +0100 Subject: [PATCH 079/133] Use reference parse to initialize parser moves (#9722) --- spacy/pipeline/_parser_internals/arc_eager.pyx | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/spacy/pipeline/_parser_internals/arc_eager.pyx b/spacy/pipeline/_parser_internals/arc_eager.pyx index f34975858..ddcc911c8 100644 --- a/spacy/pipeline/_parser_internals/arc_eager.pyx +++ b/spacy/pipeline/_parser_internals/arc_eager.pyx @@ -585,7 +585,10 @@ cdef class ArcEager(TransitionSystem): actions[RIGHT][label] = 1 actions[REDUCE][label] = 1 for example in kwargs.get('examples', []): - heads, labels = example.get_aligned_parse(projectivize=True) + # use heads and labels from the reference parse (without regard to + # misalignments between the predicted and reference) + example_gold_preproc = Example(example.reference, example.reference) + heads, labels = example_gold_preproc.get_aligned_parse(projectivize=True) for child, (head, label) in enumerate(zip(heads, labels)): if head is None or label is None: continue From a77f50baa43029d3676fdaa6079e0635444de21b Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 23 Nov 2021 15:17:19 +0100 Subject: [PATCH 080/133] Allow Scorer.score_spans to handle pred docs with missing annotation (#9701) If the predicted docs are missing annotation according to `has_annotation`, treat the docs as having no predictions rather than raising errors when the annotation is missing. The motivation for this is a combined tokenization+sents scorer for a component where the sents annotation is optional. To provide a single scorer in the component factory, it needs to be possible for the scorer to continue despite missing sents annotation in the case where the component is not annotating sents. 
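For a concrete picture of that use case, a sketch (the `score_sents` helper is hypothetical and not part of the diff below): a combined scorer can pass `has_annotation` so that docs without the optional annotation are treated as having no predictions instead of raising.

```python
# Sketch: score sentence spans only where the reference doc has sentence
# boundaries; with this change, missing boundaries on the predicted doc are
# treated as zero predictions rather than causing errors.
from spacy.scorer import Scorer

def score_sents(examples, **kwargs):
    return Scorer.score_spans(
        examples,
        "sents",
        has_annotation=lambda doc: doc.has_annotation("SENT_START"),
        **kwargs,
    )
```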
--- spacy/scorer.py | 38 +++++++++++++++++++++----------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/spacy/scorer.py b/spacy/scorer.py index cfdf34e62..4d596b5e1 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -359,14 +359,15 @@ class Scorer: pred_doc = example.predicted gold_doc = example.reference # Option to handle docs without annotation for this attribute - if has_annotation is not None: - if not has_annotation(gold_doc): - continue - # Find all labels in gold and doc - labels = set( - [k.label_ for k in getter(gold_doc, attr)] - + [k.label_ for k in getter(pred_doc, attr)] - ) + if has_annotation is not None and not has_annotation(gold_doc): + continue + # Find all labels in gold + labels = set([k.label_ for k in getter(gold_doc, attr)]) + # If labeled, find all labels in pred + if has_annotation is None or ( + has_annotation is not None and has_annotation(pred_doc) + ): + labels |= set([k.label_ for k in getter(pred_doc, attr)]) # Set up all labels for per type scoring and prepare gold per type gold_per_type: Dict[str, Set] = {label: set() for label in labels} for label in labels: @@ -384,16 +385,19 @@ class Scorer: gold_spans.add(gold_span) gold_per_type[span.label_].add(gold_span) pred_per_type: Dict[str, Set] = {label: set() for label in labels} - for span in example.get_aligned_spans_x2y( - getter(pred_doc, attr), allow_overlap + if has_annotation is None or ( + has_annotation is not None and has_annotation(pred_doc) ): - pred_span: Tuple - if labeled: - pred_span = (span.label_, span.start, span.end - 1) - else: - pred_span = (span.start, span.end - 1) - pred_spans.add(pred_span) - pred_per_type[span.label_].add(pred_span) + for span in example.get_aligned_spans_x2y( + getter(pred_doc, attr), allow_overlap + ): + pred_span: Tuple + if labeled: + pred_span = (span.label_, span.start, span.end - 1) + else: + pred_span = (span.start, span.end - 1) + pred_spans.add(pred_span) + pred_per_type[span.label_].add(pred_span) # Scores per label if labeled: for k, v in score_per_type.items(): From 9ac6d4991eb34d47f2e42bf7418918d49cf76219 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 23 Nov 2021 15:33:33 +0100 Subject: [PATCH 081/133] Add doc_cleaner component (#9659) * Add doc_cleaner component * Fix types * Fix loop * Rephrase method description --- spacy/errors.py | 1 + spacy/pipeline/functions.py | 64 ++++++++++++++++++++++++++ spacy/tests/pipeline/test_functions.py | 25 ++++++++++ website/docs/api/pipeline-functions.md | 22 +++++++++ 4 files changed, 112 insertions(+) diff --git a/spacy/errors.py b/spacy/errors.py index 5fe550145..84c407422 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -191,6 +191,7 @@ class Warnings(metaclass=ErrorsWithCodes): "lead to errors.") W115 = ("Skipping {method}: the floret vector table cannot be modified. 
" "Vectors are calculated from character ngrams.") + W116 = ("Unable to clean attribute '{attr}'.") class Errors(metaclass=ErrorsWithCodes): diff --git a/spacy/pipeline/functions.py b/spacy/pipeline/functions.py index f0a75dc2c..c005395bf 100644 --- a/spacy/pipeline/functions.py +++ b/spacy/pipeline/functions.py @@ -1,6 +1,8 @@ from typing import Dict, Any import srsly +import warnings +from ..errors import Warnings from ..language import Language from ..matcher import Matcher from ..tokens import Doc @@ -136,3 +138,65 @@ class TokenSplitter: "cfg": lambda p: self._set_config(srsly.read_json(p)), } util.from_disk(path, serializers, []) + + +@Language.factory( + "doc_cleaner", + default_config={"attrs": {"tensor": None, "_.trf_data": None}, "silent": True}, +) +def make_doc_cleaner(nlp: Language, name: str, *, attrs: Dict[str, Any], silent: bool): + return DocCleaner(attrs, silent=silent) + + +class DocCleaner: + def __init__(self, attrs: Dict[str, Any], *, silent: bool = True): + self.cfg: Dict[str, Any] = {"attrs": dict(attrs), "silent": silent} + + def __call__(self, doc: Doc) -> Doc: + attrs: dict = self.cfg["attrs"] + silent: bool = self.cfg["silent"] + for attr, value in attrs.items(): + obj = doc + parts = attr.split(".") + skip = False + for part in parts[:-1]: + if hasattr(obj, part): + obj = getattr(obj, part) + else: + skip = True + if not silent: + warnings.warn(Warnings.W116.format(attr=attr)) + if not skip: + if hasattr(obj, parts[-1]): + setattr(obj, parts[-1], value) + else: + if not silent: + warnings.warn(Warnings.W116.format(attr=attr)) + return doc + + def to_bytes(self, **kwargs): + serializers = { + "cfg": lambda: srsly.json_dumps(self.cfg), + } + return util.to_bytes(serializers, []) + + def from_bytes(self, data, **kwargs): + deserializers = { + "cfg": lambda b: self.cfg.update(srsly.json_loads(b)), + } + util.from_bytes(data, deserializers, []) + return self + + def to_disk(self, path, **kwargs): + path = util.ensure_path(path) + serializers = { + "cfg": lambda p: srsly.write_json(p, self.cfg), + } + return util.to_disk(path, serializers, []) + + def from_disk(self, path, **kwargs): + path = util.ensure_path(path) + serializers = { + "cfg": lambda p: self.cfg.update(srsly.read_json(p)), + } + util.from_disk(path, serializers, []) diff --git a/spacy/tests/pipeline/test_functions.py b/spacy/tests/pipeline/test_functions.py index 454d7b08b..e4adfe2fe 100644 --- a/spacy/tests/pipeline/test_functions.py +++ b/spacy/tests/pipeline/test_functions.py @@ -3,6 +3,8 @@ from spacy.pipeline.functions import merge_subtokens from spacy.language import Language from spacy.tokens import Span, Doc +from ..doc.test_underscore import clean_underscore # noqa: F401 + @pytest.fixture def doc(en_vocab): @@ -74,3 +76,26 @@ def test_token_splitter(): "i", ] assert all(len(t.text) <= token_splitter.split_length for t in doc) + + +@pytest.mark.usefixtures("clean_underscore") +def test_factories_doc_cleaner(): + nlp = Language() + nlp.add_pipe("doc_cleaner") + doc = nlp.make_doc("text") + doc.tensor = [1, 2, 3] + doc = nlp(doc) + assert doc.tensor is None + + nlp = Language() + nlp.add_pipe("doc_cleaner", config={"silent": False}) + with pytest.warns(UserWarning): + doc = nlp("text") + + Doc.set_extension("test_attr", default=-1) + nlp = Language() + nlp.add_pipe("doc_cleaner", config={"attrs": {"_.test_attr": 0}}) + doc = nlp.make_doc("text") + doc._.test_attr = 100 + doc = nlp(doc) + assert doc._.test_attr == 0 diff --git a/website/docs/api/pipeline-functions.md 
b/website/docs/api/pipeline-functions.md index a776eca9b..ff19d3e71 100644 --- a/website/docs/api/pipeline-functions.md +++ b/website/docs/api/pipeline-functions.md @@ -130,3 +130,25 @@ exceed the transformer model max length. | `min_length` | The minimum length for a token to be split. Defaults to `25`. ~~int~~ | | `split_length` | The length of the split tokens. Defaults to `5`. ~~int~~ | | **RETURNS** | The modified `Doc` with the split tokens. ~~Doc~~ | + +## doc_cleaner {#doc_cleaner tag="function" new="3.2.1"} + +Clean up `Doc` attributes. Intended for use at the end of pipelines with +`tok2vec` or `transformer` pipeline components that store tensors and other +values that can require a lot of memory and frequently aren't needed after the +whole pipeline has run. + +> #### Example +> +> ```python +> config = {"attrs": {"tensor": None}} +> nlp.add_pipe("doc_cleaner", config=config) +> doc = nlp("text") +> assert doc.tensor is None +> ``` + +| Setting | Description | +| ----------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `attrs` | A dict of the `Doc` attributes and the values to set them to. Defaults to `{"tensor": None, "_.trf_data": None}` to clean up after `tok2vec` and `transformer` components. ~~dict~~ | +| `silent` | If `False`, show warnings if attributes aren't found or can't be set. Defaults to `True`. ~~bool~~ | +| **RETURNS** | The modified `Doc` with the modified attributes. ~~Doc~~ | From a7d7e80adb9f325efa209ef0deb7365bdc76ee04 Mon Sep 17 00:00:00 2001 From: Duygu Altinok Date: Tue, 23 Nov 2021 16:26:05 +0100 Subject: [PATCH 082/133] EntityRuler improve disk load error message (#9658) * added error string * added serialization test * added more to if statements * wrote file to tempdir * added tempdir * changed parameter a bit * Update spacy/tests/pipeline/test_entity_ruler.py Co-authored-by: Sofie Van Landeghem --- spacy/errors.py | 1 + spacy/pipeline/entityruler.py | 12 ++++++++++-- spacy/tests/pipeline/test_entity_ruler.py | 22 ++++++++++++++++++++++ 3 files changed, 33 insertions(+), 2 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index 84c407422..c5e364013 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -888,6 +888,7 @@ class Errors(metaclass=ErrorsWithCodes): E1021 = ("`pos` value \"{pp}\" is not a valid Universal Dependencies tag. " "Non-UD tags should use the `tag` property.") E1022 = ("Words must be of type str or int, but input is of type '{wtype}'") + E1023 = ("Couldn't read EntityRuler from the {path}. 
This file doesn't exist.") # Deprecated model shortcuts, only used in errors and warnings diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index 2c3db2575..78d7a0be2 100644 --- a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -431,10 +431,16 @@ class EntityRuler(Pipe): path = ensure_path(path) self.clear() depr_patterns_path = path.with_suffix(".jsonl") - if depr_patterns_path.is_file(): + if path.suffix == ".jsonl": # user provides a jsonl + if path.is_file: + patterns = srsly.read_jsonl(path) + self.add_patterns(patterns) + else: + raise ValueError(Errors.E1023.format(path=path)) + elif depr_patterns_path.is_file(): patterns = srsly.read_jsonl(depr_patterns_path) self.add_patterns(patterns) - else: + elif path.is_dir(): # path is a valid directory cfg = {} deserializers_patterns = { "patterns": lambda p: self.add_patterns( @@ -451,6 +457,8 @@ class EntityRuler(Pipe): self.nlp.vocab, attr=self.phrase_matcher_attr ) from_disk(path, deserializers_patterns, {}) + else: # path is not a valid directory or file + raise ValueError(Errors.E146.format(path=path)) return self def to_disk( diff --git a/spacy/tests/pipeline/test_entity_ruler.py b/spacy/tests/pipeline/test_entity_ruler.py index dc0ca0301..e66b49518 100644 --- a/spacy/tests/pipeline/test_entity_ruler.py +++ b/spacy/tests/pipeline/test_entity_ruler.py @@ -5,6 +5,8 @@ from spacy.tokens import Span from spacy.language import Language from spacy.pipeline import EntityRuler from spacy.errors import MatchPatternError +from spacy.tests.util import make_tempdir + from thinc.api import NumpyOps, get_current_ops @@ -238,3 +240,23 @@ def test_entity_ruler_multiprocessing(nlp, n_process): for doc in nlp.pipe(texts, n_process=2): for ent in doc.ents: assert ent.ent_id_ == "1234" + + +def test_entity_ruler_serialize_jsonl(nlp, patterns): + ruler = nlp.add_pipe("entity_ruler") + ruler.add_patterns(patterns) + with make_tempdir() as d: + ruler.to_disk(d / "test_ruler.jsonl") + ruler.from_disk(d / "test_ruler.jsonl") # read from an existing jsonl file + with pytest.raises(ValueError): + ruler.from_disk(d / "non_existing.jsonl") # read from a bad jsonl file + + +def test_entity_ruler_serialize_dir(nlp, patterns): + ruler = nlp.add_pipe("entity_ruler") + ruler.add_patterns(patterns) + with make_tempdir() as d: + ruler.to_disk(d / "test_ruler") + ruler.from_disk(d / "test_ruler") # read from an existing directory + with pytest.raises(ValueError): + ruler.from_disk(d / "non_existing_dir") # read from a bad directory From 0bbf86bba8f596f0cbf0132527ab2f767343c488 Mon Sep 17 00:00:00 2001 From: Valentin-Gabriel Soumah <60576980+Pantalaymon@users.noreply.github.com> Date: Tue, 23 Nov 2021 17:29:23 +0100 Subject: [PATCH 083/133] Create Pantalaymon.md Submitting agreement to spacy in order to contribute to Coreferee project . --- .github/contributors/Pantalaymon.md | 106 ++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 .github/contributors/Pantalaymon.md diff --git a/.github/contributors/Pantalaymon.md b/.github/contributors/Pantalaymon.md new file mode 100644 index 000000000..f017f2947 --- /dev/null +++ b/.github/contributors/Pantalaymon.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). 
+The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. 
You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name |Valentin-Gabriel Soumah| +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2021-11-23 | +| GitHub username | Pantalaymon | +| Website (optional) | | From a4c43e5c577d7a143ef7e2fd74ccea33aace96b7 Mon Sep 17 00:00:00 2001 From: Natalia Rodnova <4512370+nrodnova@users.noreply.github.com> Date: Wed, 24 Nov 2021 02:37:10 -0700 Subject: [PATCH 084/133] Allow Matcher to match on ENT_ID and ENT_KB_ID (#9688) * Added ENT_ID and ENT_KB_ID into the list of the attributes that Matcher matches on * Added ENT_ID and ENT_KB_ID to TEST_PATTERNS in test_pattern_validation.py. 
Disabled tests that I added before * Update website/docs/api/matcher.md * Format * Remove skipped tests Co-authored-by: Adriane Boyd --- spacy/schemas.py | 2 ++ spacy/tests/matcher/test_pattern_validation.py | 4 ++++ website/docs/api/matcher.md | 2 ++ 3 files changed, 8 insertions(+) diff --git a/spacy/schemas.py b/spacy/schemas.py index b3ea11d8b..cf58688ef 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -222,6 +222,8 @@ class TokenPattern(BaseModel): lemma: Optional[StringValue] = None shape: Optional[StringValue] = None ent_type: Optional[StringValue] = None + ent_id: Optional[StringValue] = None + ent_kb_id: Optional[StringValue] = None norm: Optional[StringValue] = None length: Optional[NumberValue] = None spacy: Optional[StrictBool] = None diff --git a/spacy/tests/matcher/test_pattern_validation.py b/spacy/tests/matcher/test_pattern_validation.py index 4d21aea81..74feb7c5d 100644 --- a/spacy/tests/matcher/test_pattern_validation.py +++ b/spacy/tests/matcher/test_pattern_validation.py @@ -22,6 +22,8 @@ TEST_PATTERNS = [ ([{"TEXT": {"VALUE": "foo"}}], 2, 0), # prev: (1, 0) ([{"IS_DIGIT": -1}], 1, 0), ([{"ORTH": -1}], 1, 0), + ([{"ENT_ID": -1}], 1, 0), + ([{"ENT_KB_ID": -1}], 1, 0), # Good patterns ([{"TEXT": "foo"}, {"LOWER": "bar"}], 0, 0), ([{"LEMMA": {"IN": ["love", "like"]}}, {"POS": "DET", "OP": "?"}], 0, 0), @@ -33,6 +35,8 @@ TEST_PATTERNS = [ ([{"orth": "foo"}], 0, 0), # prev: xfail ([{"IS_SENT_START": True}], 0, 0), ([{"SENT_START": True}], 0, 0), + ([{"ENT_ID": "STRING"}], 0, 0), + ([{"ENT_KB_ID": "STRING"}], 0, 0), ] diff --git a/website/docs/api/matcher.md b/website/docs/api/matcher.md index c34560dec..803105ba2 100644 --- a/website/docs/api/matcher.md +++ b/website/docs/api/matcher.md @@ -44,6 +44,8 @@ rule-based matching are: | `SPACY` | Token has a trailing space. ~~bool~~ | |  `POS`, `TAG`, `MORPH`, `DEP`, `LEMMA`, `SHAPE` | The token's simple and extended part-of-speech tag, morphological analysis, dependency label, lemma, shape. ~~str~~ | | `ENT_TYPE` | The token's entity label. ~~str~~ | +| `ENT_ID` | The token's entity ID (`ent_id`). ~~str~~ | +| `ENT_KB_ID` | The token's entity knowledge base ID (`ent_kb_id`). ~~str~~ | | `_` 2.1 | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). ~~Dict[str, Any]~~ | | `OP` | Operator or quantifier to determine how often to match a token pattern. ~~str~~ | From 5c445332632079489acf214a675f0a193b383915 Mon Sep 17 00:00:00 2001 From: Tuomo Hiippala Date: Sun, 28 Nov 2021 12:33:16 +0200 Subject: [PATCH 085/133] add entry for Applied Language Technology under "Courses" (#9755) Added the following entry into `universe.json`: ``` { "type": "education", "id": "applt-course", "title": "Applied Language Technology", "slogan": "NLP for newcomers using spaCy and Stanza", "description": "These learning materials provide an introduction to applied language technology for audiences who are unfamiliar with language technology and programming. 
The learning materials assume no previous knowledge of the Python programming language.", "url": "https://applied-language-technology.readthedocs.io/", "image": "https://www.mv.helsinki.fi/home/thiippal/images/applt-preview.jpg", "thumb": "https://applied-language-technology.readthedocs.io/en/latest/_static/logo.png", "author": "Tuomo Hiippala", "author_links": { "twitter": "tuomo_h", "github": "thiippal", "website": "https://www.mv.helsinki.fi/home/thiippal/" }, "category": ["courses"] }, ``` --- website/meta/universe.json | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/website/meta/universe.json b/website/meta/universe.json index 7f3813a95..d11b0e8c5 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -1752,6 +1752,23 @@ }, "category": ["courses"] }, + { + "type": "education", + "id": "applt-course", + "title": "Applied Language Technology", + "slogan": "NLP for newcomers using spaCy and Stanza", + "description": "These learning materials provide an introduction to applied language technology for audiences who are unfamiliar with language technology and programming. The learning materials assume no previous knowledge of the Python programming language.", + "url": "https://applied-language-technology.readthedocs.io/", + "image": "https://www.mv.helsinki.fi/home/thiippal/images/applt-preview.jpg", + "thumb": "https://applied-language-technology.readthedocs.io/en/latest/_static/logo.png", + "author": "Tuomo Hiippala", + "author_links": { + "twitter": "tuomo_h", + "github": "thiippal", + "website": "https://www.mv.helsinki.fi/home/thiippal/" + }, + "category": ["courses"] + }, { "type": "education", "id": "video-spacys-ner-model", From 7b134b8fbd64bd8cfad2a0ecd7be9b6a7d7a907d Mon Sep 17 00:00:00 2001 From: Richard Hudson Date: Sun, 28 Nov 2021 21:59:23 +0100 Subject: [PATCH 086/133] New tests for a number of alpha languages (#9703) * Added Slovak * Added Slovenian tests * Added Estonian tests * Added Croatian tests * Added Latvian tests * Added Icelandic tests * Added Afrikaans tests * Added language-independent tests * Added Kannada tests * Tidied up * Added Albanian tests * Formatted with black * Added failing tests for anomalies * Update spacy/tests/lang/af/test_text.py Co-authored-by: Sofie Van Landeghem * Added context to failing Estonian tokenizer test Co-authored-by: Sofie Van Landeghem * Added context to failing Croatian tokenizer test Co-authored-by: Sofie Van Landeghem * Added context to failing Icelandic tokenizer test Co-authored-by: Sofie Van Landeghem * Added context to failing Latvian tokenizer test Co-authored-by: Sofie Van Landeghem * Added context to failing Slovak tokenizer test Co-authored-by: Sofie Van Landeghem * Added context to failing Slovenian tokenizer test Co-authored-by: Sofie Van Landeghem Co-authored-by: Sofie Van Landeghem --- spacy/tests/conftest.py | 40 ++++++++++++++++++++++ spacy/tests/lang/af/__init__.py | 0 spacy/tests/lang/af/test_text.py | 22 ++++++++++++ spacy/tests/lang/af/test_tokenizer.py | 29 ++++++++++++++++ spacy/tests/lang/et/__init__.py | 0 spacy/tests/lang/et/test_text.py | 26 +++++++++++++++ spacy/tests/lang/et/test_tokenizer.py | 29 ++++++++++++++++ spacy/tests/lang/hr/__init__.py | 0 spacy/tests/lang/hr/test_text.py | 26 +++++++++++++++ spacy/tests/lang/hr/test_tokenizer.py | 31 +++++++++++++++++ spacy/tests/lang/is/__init__.py | 0 spacy/tests/lang/is/test_text.py | 26 +++++++++++++++ spacy/tests/lang/is/test_tokenizer.py | 30 +++++++++++++++++ spacy/tests/lang/lv/__init__.py | 0 
spacy/tests/lang/lv/test_text.py | 27 +++++++++++++++ spacy/tests/lang/lv/test_tokenizer.py | 30 +++++++++++++++++ spacy/tests/lang/sk/__init__.py | 0 spacy/tests/lang/sk/test_text.py | 48 +++++++++++++++++++++++++++ spacy/tests/lang/sk/test_tokenizer.py | 15 +++++++++ spacy/tests/lang/sl/__init__.py | 0 spacy/tests/lang/sl/test_text.py | 27 +++++++++++++++ spacy/tests/lang/sl/test_tokenizer.py | 32 ++++++++++++++++++ spacy/tests/lang/sq/__init__.py | 0 spacy/tests/lang/sq/test_text.py | 25 ++++++++++++++ spacy/tests/lang/sq/test_tokenizer.py | 31 +++++++++++++++++ spacy/tests/lang/xx/__init__.py | 0 spacy/tests/lang/xx/test_text.py | 24 ++++++++++++++ spacy/tests/lang/xx/test_tokenizer.py | 25 ++++++++++++++ 28 files changed, 543 insertions(+) create mode 100644 spacy/tests/lang/af/__init__.py create mode 100644 spacy/tests/lang/af/test_text.py create mode 100644 spacy/tests/lang/af/test_tokenizer.py create mode 100644 spacy/tests/lang/et/__init__.py create mode 100644 spacy/tests/lang/et/test_text.py create mode 100644 spacy/tests/lang/et/test_tokenizer.py create mode 100644 spacy/tests/lang/hr/__init__.py create mode 100644 spacy/tests/lang/hr/test_text.py create mode 100644 spacy/tests/lang/hr/test_tokenizer.py create mode 100644 spacy/tests/lang/is/__init__.py create mode 100644 spacy/tests/lang/is/test_text.py create mode 100644 spacy/tests/lang/is/test_tokenizer.py create mode 100644 spacy/tests/lang/lv/__init__.py create mode 100644 spacy/tests/lang/lv/test_text.py create mode 100644 spacy/tests/lang/lv/test_tokenizer.py create mode 100644 spacy/tests/lang/sk/__init__.py create mode 100644 spacy/tests/lang/sk/test_text.py create mode 100644 spacy/tests/lang/sk/test_tokenizer.py create mode 100644 spacy/tests/lang/sl/__init__.py create mode 100644 spacy/tests/lang/sl/test_text.py create mode 100644 spacy/tests/lang/sl/test_tokenizer.py create mode 100644 spacy/tests/lang/sq/__init__.py create mode 100644 spacy/tests/lang/sq/test_text.py create mode 100644 spacy/tests/lang/sq/test_tokenizer.py create mode 100644 spacy/tests/lang/xx/__init__.py create mode 100644 spacy/tests/lang/xx/test_text.py create mode 100644 spacy/tests/lang/xx/test_tokenizer.py diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 88c7adfe3..ffca79bb9 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -49,6 +49,11 @@ def tokenizer(): return get_lang_class("xx")().tokenizer +@pytest.fixture(scope="session") +def af_tokenizer(): + return get_lang_class("af")().tokenizer + + @pytest.fixture(scope="session") def am_tokenizer(): return get_lang_class("am")().tokenizer @@ -125,6 +130,11 @@ def es_vocab(): return get_lang_class("es")().vocab +@pytest.fixture(scope="session") +def et_tokenizer(): + return get_lang_class("et")().tokenizer + + @pytest.fixture(scope="session") def eu_tokenizer(): return get_lang_class("eu")().tokenizer @@ -185,6 +195,11 @@ def id_tokenizer(): return get_lang_class("id")().tokenizer +@pytest.fixture(scope="session") +def is_tokenizer(): + return get_lang_class("is")().tokenizer + + @pytest.fixture(scope="session") def it_tokenizer(): return get_lang_class("it")().tokenizer @@ -212,6 +227,11 @@ def lt_tokenizer(): return get_lang_class("lt")().tokenizer +@pytest.fixture(scope="session") +def lv_tokenizer(): + return get_lang_class("lv")().tokenizer + + @pytest.fixture(scope="session") def mk_tokenizer(): return get_lang_class("mk")().tokenizer @@ -279,11 +299,26 @@ def sa_tokenizer(): return get_lang_class("sa")().tokenizer +@pytest.fixture(scope="session") 
+def sk_tokenizer(): + return get_lang_class("sk")().tokenizer + + +@pytest.fixture(scope="session") +def sl_tokenizer(): + return get_lang_class("sl")().tokenizer + + @pytest.fixture(scope="session") def sr_tokenizer(): return get_lang_class("sr")().tokenizer +@pytest.fixture(scope="session") +def sq_tokenizer(): + return get_lang_class("sq")().tokenizer + + @pytest.fixture(scope="session") def sv_tokenizer(): return get_lang_class("sv")().tokenizer @@ -344,6 +379,11 @@ def vi_tokenizer(): return get_lang_class("vi")().tokenizer +@pytest.fixture(scope="session") +def xx_tokenizer(): + return get_lang_class("xx")().tokenizer + + @pytest.fixture(scope="session") def yo_tokenizer(): return get_lang_class("yo")().tokenizer diff --git a/spacy/tests/lang/af/__init__.py b/spacy/tests/lang/af/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/lang/af/test_text.py b/spacy/tests/lang/af/test_text.py new file mode 100644 index 000000000..99c2a9f4c --- /dev/null +++ b/spacy/tests/lang/af/test_text.py @@ -0,0 +1,22 @@ +import pytest + + +def test_long_text(af_tokenizer): + # Excerpt: Universal Declaration of Human Rights; “'n” changed to “die” in first sentence + text = """ +Hierdie Universele Verklaring van Menseregte as die algemene standaard vir die verwesenliking deur alle mense en nasies, +om te verseker dat elke individu en elke deel van die gemeenskap hierdie Verklaring in ag sal neem en deur opvoeding, +respek vir hierdie regte en vryhede te bevorder, op nasionale en internasionale vlak, daarna sal strewe om die universele +en effektiewe erkenning en agting van hierdie regte te verseker, nie net vir die mense van die Lidstate nie, maar ook vir +die mense in die gebiede onder hul jurisdiksie. + +""" + tokens = af_tokenizer(text) + assert len(tokens) == 100 + + +@pytest.mark.xfail +def test_indefinite_article(af_tokenizer): + text = "as 'n algemene standaard" + tokens = af_tokenizer(text) + assert len(tokens) == 4 diff --git a/spacy/tests/lang/af/test_tokenizer.py b/spacy/tests/lang/af/test_tokenizer.py new file mode 100644 index 000000000..db52db5e3 --- /dev/null +++ b/spacy/tests/lang/af/test_tokenizer.py @@ -0,0 +1,29 @@ +import pytest + +AF_BASIC_TOKENIZATION_TESTS = [ + ( + "Elkeen het die reg tot lewe, vryheid en sekuriteit van persoon.", + [ + "Elkeen", + "het", + "die", + "reg", + "tot", + "lewe", + ",", + "vryheid", + "en", + "sekuriteit", + "van", + "persoon", + ".", + ], + ), +] + + +@pytest.mark.parametrize("text,expected_tokens", AF_BASIC_TOKENIZATION_TESTS) +def test_af_tokenizer_basic(af_tokenizer, text, expected_tokens): + tokens = af_tokenizer(text) + token_list = [token.text for token in tokens if not token.is_space] + assert expected_tokens == token_list diff --git a/spacy/tests/lang/et/__init__.py b/spacy/tests/lang/et/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/lang/et/test_text.py b/spacy/tests/lang/et/test_text.py new file mode 100644 index 000000000..9515a7cc1 --- /dev/null +++ b/spacy/tests/lang/et/test_text.py @@ -0,0 +1,26 @@ +import pytest + + +def test_long_text(et_tokenizer): + # Excerpt: European Convention on Human Rights + text = """ +arvestades, et nimetatud deklaratsiooni eesmärk on tagada selles +kuulutatud õiguste üldine ja tõhus tunnustamine ning järgimine; +arvestades, et Euroopa Nõukogu eesmärk on saavutada tema +liikmete suurem ühtsus ning et üheks selle eesmärgi saavutamise +vahendiks on inimõiguste ja põhivabaduste järgimine ning +elluviimine; +taaskinnitades oma sügavat usku 
neisse põhivabadustesse, mis +on õigluse ja rahu aluseks maailmas ning mida kõige paremini +tagab ühelt poolt tõhus poliitiline demokraatia ning teiselt poolt +inimõiguste, millest nad sõltuvad, üldine mõistmine ja järgimine; +""" + tokens = et_tokenizer(text) + assert len(tokens) == 94 + + +@pytest.mark.xfail +def test_ordinal_number(et_tokenizer): + text = "10. detsembril 1948" + tokens = et_tokenizer(text) + assert len(tokens) == 3 diff --git a/spacy/tests/lang/et/test_tokenizer.py b/spacy/tests/lang/et/test_tokenizer.py new file mode 100644 index 000000000..f0f8079ca --- /dev/null +++ b/spacy/tests/lang/et/test_tokenizer.py @@ -0,0 +1,29 @@ +import pytest + +ET_BASIC_TOKENIZATION_TESTS = [ + ( + "Kedagi ei või piinata ega ebainimlikult või alandavalt kohelda " + "ega karistada.", + [ + "Kedagi", + "ei", + "või", + "piinata", + "ega", + "ebainimlikult", + "või", + "alandavalt", + "kohelda", + "ega", + "karistada", + ".", + ], + ), +] + + +@pytest.mark.parametrize("text,expected_tokens", ET_BASIC_TOKENIZATION_TESTS) +def test_et_tokenizer_basic(et_tokenizer, text, expected_tokens): + tokens = et_tokenizer(text) + token_list = [token.text for token in tokens if not token.is_space] + assert expected_tokens == token_list diff --git a/spacy/tests/lang/hr/__init__.py b/spacy/tests/lang/hr/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/lang/hr/test_text.py b/spacy/tests/lang/hr/test_text.py new file mode 100644 index 000000000..82e65afe7 --- /dev/null +++ b/spacy/tests/lang/hr/test_text.py @@ -0,0 +1,26 @@ +import pytest + + +def test_long_text(hr_tokenizer): + # Excerpt: European Convention on Human Rights + text = """ +uzimajući u obzir da ta deklaracija nastoji osigurati opće i djelotvorno +priznanje i poštovanje u njoj proglašenih prava; +uzimajući u obzir da je cilj Vijeća Europe postizanje većeg jedinstva +njegovih članica, i da je jedan od načina postizanja toga cilja +očuvanje i daljnje ostvarivanje ljudskih prava i temeljnih sloboda; +potvrđujući svoju duboku privrženost tim temeljnim slobodama +koje su osnova pravde i mira u svijetu i koje su najbolje zaštićene +istinskom političkom demokracijom s jedne strane te zajedničkim +razumijevanjem i poštovanjem ljudskih prava o kojima te slobode +ovise s druge strane; +""" + tokens = hr_tokenizer(text) + assert len(tokens) == 105 + + +@pytest.mark.xfail +def test_ordinal_number(hr_tokenizer): + text = "10. 
prosinca 1948" + tokens = hr_tokenizer(text) + assert len(tokens) == 3 diff --git a/spacy/tests/lang/hr/test_tokenizer.py b/spacy/tests/lang/hr/test_tokenizer.py new file mode 100644 index 000000000..dace33b2d --- /dev/null +++ b/spacy/tests/lang/hr/test_tokenizer.py @@ -0,0 +1,31 @@ +import pytest + +HR_BASIC_TOKENIZATION_TESTS = [ + ( + "Nitko se ne smije podvrgnuti mučenju ni nečovječnom ili " + "ponižavajućem postupanju ili kazni.", + [ + "Nitko", + "se", + "ne", + "smije", + "podvrgnuti", + "mučenju", + "ni", + "nečovječnom", + "ili", + "ponižavajućem", + "postupanju", + "ili", + "kazni", + ".", + ], + ), +] + + +@pytest.mark.parametrize("text,expected_tokens", HR_BASIC_TOKENIZATION_TESTS) +def test_hr_tokenizer_basic(hr_tokenizer, text, expected_tokens): + tokens = hr_tokenizer(text) + token_list = [token.text for token in tokens if not token.is_space] + assert expected_tokens == token_list diff --git a/spacy/tests/lang/is/__init__.py b/spacy/tests/lang/is/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/lang/is/test_text.py b/spacy/tests/lang/is/test_text.py new file mode 100644 index 000000000..6e3654a6e --- /dev/null +++ b/spacy/tests/lang/is/test_text.py @@ -0,0 +1,26 @@ +import pytest + + +def test_long_text(is_tokenizer): + # Excerpt: European Convention on Human Rights + text = """ +hafa í huga, að yfirlýsing þessi hefur það markmið að tryggja +almenna og raunhæfa viðurkenningu og vernd þeirra réttinda, +sem þar er lýst; +hafa í huga, að markmið Evrópuráðs er að koma á nánari einingu +aðildarríkjanna og að ein af leiðunum að því marki er sú, að +mannréttindi og mannfrelsi séu í heiðri höfð og efld; +lýsa á ný eindreginni trú sinni á það mannfrelsi, sem er undirstaða +réttlætis og friðar í heiminum og best er tryggt, annars vegar með +virku, lýðræðislegu stjórnarfari og, hins vegar, almennum skilningi +og varðveislu þeirra mannréttinda, sem eru grundvöllur frelsisins; +""" + tokens = is_tokenizer(text) + assert len(tokens) == 120 + + +@pytest.mark.xfail +def test_ordinal_number(is_tokenizer): + text = "10. desember 1948" + tokens = is_tokenizer(text) + assert len(tokens) == 3 diff --git a/spacy/tests/lang/is/test_tokenizer.py b/spacy/tests/lang/is/test_tokenizer.py new file mode 100644 index 000000000..0c05a6050 --- /dev/null +++ b/spacy/tests/lang/is/test_tokenizer.py @@ -0,0 +1,30 @@ +import pytest + +IS_BASIC_TOKENIZATION_TESTS = [ + ( + "Enginn maður skal sæta pyndingum eða ómannlegri eða " + "vanvirðandi meðferð eða refsingu. 
", + [ + "Enginn", + "maður", + "skal", + "sæta", + "pyndingum", + "eða", + "ómannlegri", + "eða", + "vanvirðandi", + "meðferð", + "eða", + "refsingu", + ".", + ], + ), +] + + +@pytest.mark.parametrize("text,expected_tokens", IS_BASIC_TOKENIZATION_TESTS) +def test_is_tokenizer_basic(is_tokenizer, text, expected_tokens): + tokens = is_tokenizer(text) + token_list = [token.text for token in tokens if not token.is_space] + assert expected_tokens == token_list diff --git a/spacy/tests/lang/lv/__init__.py b/spacy/tests/lang/lv/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/lang/lv/test_text.py b/spacy/tests/lang/lv/test_text.py new file mode 100644 index 000000000..5ca5fd0a7 --- /dev/null +++ b/spacy/tests/lang/lv/test_text.py @@ -0,0 +1,27 @@ +import pytest + + +def test_long_text(lv_tokenizer): + # Excerpt: European Convention on Human Rights + text = """ +Ievērodamas, ka šī deklarācija paredz nodrošināt vispārēju un +efektīvu tajā pasludināto tiesību atzīšanu un ievērošanu; +Ievērodamas, ka Eiropas Padomes mērķis ir panākt lielāku vienotību +tās dalībvalstu starpā un ka viens no līdzekļiem, kā šo mērķi +sasniegt, ir cilvēka tiesību un pamatbrīvību ievērošana un turpmāka +īstenošana; +No jauna apliecinādamas patiesu pārliecību, ka šīs pamatbrīvības +ir taisnīguma un miera pamats visā pasaulē un ka tās vislabāk var +nodrošināt patiess demokrātisks politisks režīms no vienas puses un +vispārējo cilvēktiesību, uz kurām tās pamatojas, kopīga izpratne un +ievērošana no otras puses; +""" + tokens = lv_tokenizer(text) + assert len(tokens) == 109 + + +@pytest.mark.xfail +def test_ordinal_number(lv_tokenizer): + text = "10. decembrī" + tokens = lv_tokenizer(text) + assert len(tokens) == 2 diff --git a/spacy/tests/lang/lv/test_tokenizer.py b/spacy/tests/lang/lv/test_tokenizer.py new file mode 100644 index 000000000..3ce7ad5fa --- /dev/null +++ b/spacy/tests/lang/lv/test_tokenizer.py @@ -0,0 +1,30 @@ +import pytest + +LV_BASIC_TOKENIZATION_TESTS = [ + ( + "Nevienu nedrīkst spīdzināt vai cietsirdīgi vai pazemojoši ar viņu " + "apieties vai sodīt.", + [ + "Nevienu", + "nedrīkst", + "spīdzināt", + "vai", + "cietsirdīgi", + "vai", + "pazemojoši", + "ar", + "viņu", + "apieties", + "vai", + "sodīt", + ".", + ], + ), +] + + +@pytest.mark.parametrize("text,expected_tokens", LV_BASIC_TOKENIZATION_TESTS) +def test_lv_tokenizer_basic(lv_tokenizer, text, expected_tokens): + tokens = lv_tokenizer(text) + token_list = [token.text for token in tokens if not token.is_space] + assert expected_tokens == token_list diff --git a/spacy/tests/lang/sk/__init__.py b/spacy/tests/lang/sk/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/lang/sk/test_text.py b/spacy/tests/lang/sk/test_text.py new file mode 100644 index 000000000..62ea2a783 --- /dev/null +++ b/spacy/tests/lang/sk/test_text.py @@ -0,0 +1,48 @@ +import pytest + + +def test_long_text(sk_tokenizer): + # Excerpt: European Convention on Human Rights + text = """ +majúc na zreteli, že cieľom tejto deklarácie je zabezpečiť všeobecné +a účinné uznávanie a dodržiavanie práv v nej vyhlásených; +majúc na zreteli, že cieľom Rady Európy je dosiahnutie väčšej +jednoty medzi jej členmi, a že jedným zo spôsobov, ktorým sa +má tento cieľ napĺňať, je ochrana a ďalší rozvoj ľudských práv +a základných slobôd; +znovu potvrdzujúc svoju hlbokú vieru v tie základné slobody, ktoré +sú základom spravodlivosti a mieru vo svete, a ktoré sú najlepšie +zachovávané na jednej strane účinnou politickou demokraciou +a na 
strane druhej spoločným poňatím a dodržiavaním ľudských +práv, od ktorých závisia; + """ + tokens = sk_tokenizer(text) + assert len(tokens) == 118 + + +@pytest.mark.parametrize( + "text,match", + [ + ("10", True), + ("1", True), + ("10,000", True), + ("10,00", True), + ("štyri", True), + ("devätnásť", True), + ("milión", True), + ("pes", False), + (",", False), + ("1/2", True), + ], +) +def test_lex_attrs_like_number(sk_tokenizer, text, match): + tokens = sk_tokenizer(text) + assert len(tokens) == 1 + assert tokens[0].like_num == match + + +@pytest.mark.xfail +def test_ordinal_number(sk_tokenizer): + text = "10. decembra 1948" + tokens = sk_tokenizer(text) + assert len(tokens) == 3 diff --git a/spacy/tests/lang/sk/test_tokenizer.py b/spacy/tests/lang/sk/test_tokenizer.py new file mode 100644 index 000000000..247847284 --- /dev/null +++ b/spacy/tests/lang/sk/test_tokenizer.py @@ -0,0 +1,15 @@ +import pytest + +SK_BASIC_TOKENIZATION_TESTS = [ + ( + "Kedy sa narodil Andrej Kiska?", + ["Kedy", "sa", "narodil", "Andrej", "Kiska", "?"], + ), +] + + +@pytest.mark.parametrize("text,expected_tokens", SK_BASIC_TOKENIZATION_TESTS) +def test_sk_tokenizer_basic(sk_tokenizer, text, expected_tokens): + tokens = sk_tokenizer(text) + token_list = [token.text for token in tokens if not token.is_space] + assert expected_tokens == token_list diff --git a/spacy/tests/lang/sl/__init__.py b/spacy/tests/lang/sl/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/lang/sl/test_text.py b/spacy/tests/lang/sl/test_text.py new file mode 100644 index 000000000..ddc5b6b5d --- /dev/null +++ b/spacy/tests/lang/sl/test_text.py @@ -0,0 +1,27 @@ +import pytest + + +def test_long_text(sl_tokenizer): + # Excerpt: European Convention on Human Rights + text = """ +upoštevajoč, da si ta deklaracija prizadeva zagotoviti splošno in +učinkovito priznavanje in spoštovanje v njej razglašenih pravic, +upoštevajoč, da je cilj Sveta Evrope doseči večjo enotnost med +njegovimi članicami, in da je eden izmed načinov za zagotavljanje +tega cilja varstvo in nadaljnji razvoj človekovih pravic in temeljnih +svoboščin, +ponovno potrjujoč svojo globoko vero v temeljne svoboščine, na +katerih temeljita pravičnost in mir v svetu, in ki jih je mogoče najbolje +zavarovati na eni strani z dejansko politično demokracijo in na drugi +strani s skupnim razumevanjem in spoštovanjem človekovih pravic, +od katerih so te svoboščine odvisne, +""" + tokens = sl_tokenizer(text) + assert len(tokens) == 116 + + +@pytest.mark.xfail +def test_ordinal_number(sl_tokenizer): + text = "10. 
decembra 1948" + tokens = sl_tokenizer(text) + assert len(tokens) == 3 diff --git a/spacy/tests/lang/sl/test_tokenizer.py b/spacy/tests/lang/sl/test_tokenizer.py new file mode 100644 index 000000000..f2b15b0ff --- /dev/null +++ b/spacy/tests/lang/sl/test_tokenizer.py @@ -0,0 +1,32 @@ +import pytest + +SL_BASIC_TOKENIZATION_TESTS = [ + ( + "Vsakdo ima pravico do spoštovanja njegovega zasebnega in " + "družinskega življenja, doma in dopisovanja.", + [ + "Vsakdo", + "ima", + "pravico", + "do", + "spoštovanja", + "njegovega", + "zasebnega", + "in", + "družinskega", + "življenja", + ",", + "doma", + "in", + "dopisovanja", + ".", + ], + ), +] + + +@pytest.mark.parametrize("text,expected_tokens", SL_BASIC_TOKENIZATION_TESTS) +def test_sl_tokenizer_basic(sl_tokenizer, text, expected_tokens): + tokens = sl_tokenizer(text) + token_list = [token.text for token in tokens if not token.is_space] + assert expected_tokens == token_list diff --git a/spacy/tests/lang/sq/__init__.py b/spacy/tests/lang/sq/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/lang/sq/test_text.py b/spacy/tests/lang/sq/test_text.py new file mode 100644 index 000000000..44eedaa54 --- /dev/null +++ b/spacy/tests/lang/sq/test_text.py @@ -0,0 +1,25 @@ +import pytest + + +def test_long_text(sq_tokenizer): + # Excerpt: European Convention on Human Rights + text = """ +Qeveritë nënshkruese, anëtare të Këshillit të Evropës, +Duke pasur parasysh Deklaratën Universale të të Drejtave të +Njeriut, të shpallur nga Asambleja e Përgjithshme e Kombeve të +Bashkuara më 10 dhjetor 1948; +Duke pasur parasysh, se kjo Deklaratë ka për qëllim të sigurojë +njohjen dhe zbatimin universal dhe efektiv të të drejtave të +shpallura në të; +Duke pasur parasysh se qëllimi i Këshillit të Evropës është që të +realizojë një bashkim më të ngushtë midis anëtarëve të tij dhe +se një nga mjetet për të arritur këtë qëllim është mbrojtja dhe +zhvillimi i të drejtave të njeriut dhe i lirive themelore; +Duke ripohuar besimin e tyre të thellë në këto liri themelore që +përbëjnë themelet e drejtësisë dhe të paqes në botë, ruajtja e të +cilave mbështetet kryesisht mbi një regjim politik demokratik nga +njëra anë, dhe nga ana tjetër mbi një kuptim dhe respektim të +përbashkët të të drejtave të njeriut nga të cilat varen; +""" + tokens = sq_tokenizer(text) + assert len(tokens) == 182 diff --git a/spacy/tests/lang/sq/test_tokenizer.py b/spacy/tests/lang/sq/test_tokenizer.py new file mode 100644 index 000000000..8fd25f588 --- /dev/null +++ b/spacy/tests/lang/sq/test_tokenizer.py @@ -0,0 +1,31 @@ +import pytest + +SQ_BASIC_TOKENIZATION_TESTS = [ + ( + "Askush nuk mund t’i nënshtrohet torturës ose dënimeve ose " + "trajtimeve çnjerëzore ose poshtëruese.", + [ + "Askush", + "nuk", + "mund", + "t’i", + "nënshtrohet", + "torturës", + "ose", + "dënimeve", + "ose", + "trajtimeve", + "çnjerëzore", + "ose", + "poshtëruese", + ".", + ], + ), +] + + +@pytest.mark.parametrize("text,expected_tokens", SQ_BASIC_TOKENIZATION_TESTS) +def test_sq_tokenizer_basic(sq_tokenizer, text, expected_tokens): + tokens = sq_tokenizer(text) + token_list = [token.text for token in tokens if not token.is_space] + assert expected_tokens == token_list diff --git a/spacy/tests/lang/xx/__init__.py b/spacy/tests/lang/xx/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/lang/xx/test_text.py b/spacy/tests/lang/xx/test_text.py new file mode 100644 index 000000000..477f0ebe2 --- /dev/null +++ b/spacy/tests/lang/xx/test_text.py @@ -0,0 +1,24 @@ 
+import pytest + + +def test_long_text(xx_tokenizer): + # Excerpt: Text in Skolt Sami taken from https://www.samediggi.fi + text = """ +Säʹmmla lie Euroopp unioon oʹdinakai alggmeer. Säʹmmlai alggmeerstatus lij raʹvvjum Lääʹddjânnam vuâđđlääʹjjest. +Alggmeer kriteeʹr vuâđđâʹvve meeraikõskksaž tuâjjorganisaatio, ILO, suåppmõʹšše nââmar 169. +Suåppmõõžž mieʹldd jiõččvälddsaž jânnmin jälsteei meeraid ââʹnet alggmeeran, +ko sij puõlvvâʹvve naroodâst, kååʹtt jânnam välddmõõžž leʹbe aazztummuž leʹbe ânnʼjõž riikkraaʹji šõddâm ääiʹj jälste +jânnmest leʹbe tõn mäddtiõđlaž vuuʹdest, koozz jânnam kooll. Alggmeer ij leäkku mieʹrreei sââʹjest jiiʹjjes jälstemvuuʹdest. +Alggmeer âlgg jiõčč ââʹnned jiiʹjjes alggmeeran leʹbe leeʹd tõn miõlâst, što sij lie alggmeer. +Alggmeer lij õlggâm seeilted vuõiggâdvuõđlaž sââʹjest huõlǩâni obbnes leʹbe vueʹzzi jiiʹjjes sosiaalʼlaž, täälʼlaž, +kulttuurlaž da poliittlaž instituutioid. + +Säʹmmlai statuuzz ǩeeʹrjteš Lääʹddjânnam vuâđđläkka eeʹjj 1995. Säʹmmlain alggmeeran lij vuõiggâdvuõtt tuõʹllʼjed da +ooudâsviikkâd ǩiõlâz da kulttuurâz di tõõzz kuulli ääʹrbvuâlaž jieʹllemvueʹjjeez. Sääʹmǩiõl ââʹnnmest veʹrǧǧniiʹǩǩi +åʹrnn lij šiõttuum jiiʹjjes lääʹǩǩ. Säʹmmlain lij leämmaž eeʹjjest 1996 vueʹljeeʹl dommvuuʹdsteez ǩiõlâz da kulttuurâz kuõskki +vuâđđlääʹjj meâldlaž jiõččvaaldâšm. Säʹmmlai jiõččvaldšma kuulli tuâjaid håidd säʹmmlai vaalin vaʹlljääm parlameʹntt, +Sääʹmteʹǧǧ. +""" + + tokens = xx_tokenizer(text) + assert len(tokens) == 179 diff --git a/spacy/tests/lang/xx/test_tokenizer.py b/spacy/tests/lang/xx/test_tokenizer.py new file mode 100644 index 000000000..15c760a6b --- /dev/null +++ b/spacy/tests/lang/xx/test_tokenizer.py @@ -0,0 +1,25 @@ +import pytest + +XX_BASIC_TOKENIZATION_TESTS = [ + ( + "Lääʹddjânnmest lie nuʹtt 10 000 säʹmmliʹžžed. Seeʹst pâʹjjel", + [ + "Lääʹddjânnmest", + "lie", + "nuʹtt", + "10", + "000", + "säʹmmliʹžžed", + ".", + "Seeʹst", + "pâʹjjel", + ], + ), +] + + +@pytest.mark.parametrize("text,expected_tokens", XX_BASIC_TOKENIZATION_TESTS) +def test_xx_tokenizer_basic(xx_tokenizer, text, expected_tokens): + tokens = xx_tokenizer(text) + token_list = [token.text for token in tokens if not token.is_space] + assert expected_tokens == token_list From ac05de2c6c708e33ebad6c901e674e1e8bdc0688 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Mon, 29 Nov 2021 07:31:02 +0000 Subject: [PATCH 087/133] Fix Language-specific factory handling in package command (#9674) * Use internal names for factories If a component factory is registered like `@French.factory(...)` instead of `@Language.factory(...)`, the name in the factories registry will be prefixed with the language code. However in the nlp.config object the factory will be listed without the language code. The `add_pipe` code has fallback logic to handle this, but packaging code and the registry itself don't. This change makes it so that the factory name in nlp.config is the language-specific form. It's not clear if this will break anything else, but it does seem to fix the inconsistency and resolve the specific user issue that brought this to our attention. * Change approach to use fallback in package lookup This adds fallback logic to the package lookup, so it doesn't have to touch the way the config is built. It seems to fix the tests too. * Remove unecessary line * Add test Thsi also adds an assert that seems to have been forgotten. 
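To make the naming mismatch above concrete, here is a minimal sketch; the `clean_text` component and its factory function are invented for illustration and are not part of spaCy or of this patch:

```python
from spacy.lang.fr import French
from spacy.util import registry

@French.factory("clean_text")
def create_clean_text(nlp, name):
    # No-op pipeline component, purely for illustration.
    return lambda doc: doc

nlp = French()
nlp.add_pipe("clean_text")  # add_pipe already falls back to the prefixed name

# The config records the bare factory name ...
print(nlp.config["components"]["clean_text"]["factory"])  # "clean_text"

# ... while, per the description above, the factories registry keys the entry
# by the language-prefixed name, so that is the lookup that succeeds there:
print(registry.find("factories", "fr.clean_text"))
```

Before this change, the packaging code looked the factory up only under the bare name and hit a `RegistryError` for pipelines like this; the fallback below tries the `lang + "." + name` form first and then the unprefixed form.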
--- spacy/cli/package.py | 12 +++++++++++- spacy/tests/test_cli.py | 11 ++++++++++- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/spacy/cli/package.py b/spacy/cli/package.py index 76e14daf5..f9d2a9af2 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -4,6 +4,7 @@ from pathlib import Path from wasabi import Printer, MarkdownRenderer, get_raw_input from thinc.api import Config from collections import defaultdict +from catalogue import RegistryError import srsly import sys @@ -212,9 +213,18 @@ def get_third_party_dependencies( if "factory" in component: funcs["factories"].add(component["factory"]) modules = set() + lang = config["nlp"]["lang"] for reg_name, func_names in funcs.items(): for func_name in func_names: - func_info = util.registry.find(reg_name, func_name) + # Try the lang-specific version and fall back + try: + func_info = util.registry.find(reg_name, lang + "." + func_name) + except RegistryError: + try: + func_info = util.registry.find(reg_name, func_name) + except RegistryError as regerr: + # lang-specific version being absent is not actually an issue + raise regerr from None module_name = func_info.get("module") # type: ignore[attr-defined] if module_name: # the code is part of a module, not a --code file modules.add(func_info["module"].split(".")[0]) # type: ignore[index] diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index 3243d426b..c6b00b140 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -565,7 +565,16 @@ def test_get_third_party_dependencies(): } }, ) - get_third_party_dependencies(nlp.config) == [] + assert get_third_party_dependencies(nlp.config) == [] + + # Test with lang-specific factory + @Dutch.factory("third_party_test") + def test_factory(nlp, name): + return lambda x: x + + nlp.add_pipe("third_party_test") + # Before #9674 this would throw an exception + get_third_party_dependencies(nlp.config) @pytest.mark.parametrize( From 6763cbfdc03ed801576c99a5d35623cf55925e22 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 29 Nov 2021 14:14:21 +0100 Subject: [PATCH 088/133] Update Catalan acknowledgements for v3.2 (#9763) --- website/docs/usage/v3-2.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/usage/v3-2.md b/website/docs/usage/v3-2.md index 766d1c0a9..d1d45c7ba 100644 --- a/website/docs/usage/v3-2.md +++ b/website/docs/usage/v3-2.md @@ -159,7 +159,7 @@ their contributions! - All Universal Dependencies training data has been updated to v2.8. - The Catalan data, tokenizer and lemmatizer have been updated, thanks to Carlos - Rodriguez and the Barcelona Supercomputing Center! + Rodriguez, Carme Armentano and the Barcelona Supercomputing Center! - The transformer pipelines are trained using spacy-transformers v1.1, with improved IO and more options for [model config and output](/api/architectures#TransformerModel). From 1be8a4dab305466cc731f1bd9124ae13df274d54 Mon Sep 17 00:00:00 2001 From: Narayan Acharya Date: Mon, 29 Nov 2021 11:13:26 -0500 Subject: [PATCH 089/133] Displacy serve entity linking support without `manual=True` support. (#9748) * Add support for kb_id to be displayed via displacy.serve. The current support is only limited to the manual option in displacy.render * Commit to check pre-commit hooks are run. * Update spacy/displacy/__init__.py Co-authored-by: Sofie Van Landeghem * Changes as per suggestions on the PR. 
* Update website/docs/api/top-level.md Co-authored-by: Sofie Van Landeghem * Update website/docs/api/top-level.md Co-authored-by: Sofie Van Landeghem * tag option as new from 3.2.1 onwards Co-authored-by: Sofie Van Landeghem Co-authored-by: svlandeg --- spacy/displacy/__init__.py | 12 ++++++++++-- spacy/tests/test_displacy.py | 36 +++++++++++++++++++++++++++++++++-- website/docs/api/top-level.md | 26 ++++++++++++++++--------- 3 files changed, 61 insertions(+), 13 deletions(-) diff --git a/spacy/displacy/__init__.py b/spacy/displacy/__init__.py index d9418f675..25d530c83 100644 --- a/spacy/displacy/__init__.py +++ b/spacy/displacy/__init__.py @@ -181,11 +181,19 @@ def parse_deps(orig_doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]: def parse_ents(doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]: """Generate named entities in [{start: i, end: i, label: 'label'}] format. - doc (Doc): Document do parse. + doc (Doc): Document to parse. + options (Dict[str, Any]): NER-specific visualisation options. RETURNS (dict): Generated entities keyed by text (original text) and ents. """ + kb_url_template = options.get("kb_url_template", None) ents = [ - {"start": ent.start_char, "end": ent.end_char, "label": ent.label_} + { + "start": ent.start_char, + "end": ent.end_char, + "label": ent.label_, + "kb_id": ent.kb_id_ if ent.kb_id_ else "", + "kb_url": kb_url_template.format(ent.kb_id_) if kb_url_template else "#", + } for ent in doc.ents ] if not ents: diff --git a/spacy/tests/test_displacy.py b/spacy/tests/test_displacy.py index 040dd657f..790925888 100644 --- a/spacy/tests/test_displacy.py +++ b/spacy/tests/test_displacy.py @@ -1,8 +1,9 @@ import pytest + from spacy import displacy from spacy.displacy.render import DependencyRenderer, EntityRenderer -from spacy.tokens import Span, Doc from spacy.lang.fa import Persian +from spacy.tokens import Span, Doc def test_displacy_parse_ents(en_vocab): @@ -12,7 +13,38 @@ def test_displacy_parse_ents(en_vocab): ents = displacy.parse_ents(doc) assert isinstance(ents, dict) assert ents["text"] == "But Google is starting from behind " - assert ents["ents"] == [{"start": 4, "end": 10, "label": "ORG"}] + assert ents["ents"] == [ + {"start": 4, "end": 10, "label": "ORG", "kb_id": "", "kb_url": "#"} + ] + + doc.ents = [Span(doc, 1, 2, label=doc.vocab.strings["ORG"], kb_id="Q95")] + ents = displacy.parse_ents(doc) + assert isinstance(ents, dict) + assert ents["text"] == "But Google is starting from behind " + assert ents["ents"] == [ + {"start": 4, "end": 10, "label": "ORG", "kb_id": "Q95", "kb_url": "#"} + ] + + +def test_displacy_parse_ents_with_kb_id_options(en_vocab): + """Test that named entities with kb_id on a Doc are converted into displaCy's format.""" + doc = Doc(en_vocab, words=["But", "Google", "is", "starting", "from", "behind"]) + doc.ents = [Span(doc, 1, 2, label=doc.vocab.strings["ORG"], kb_id="Q95")] + + ents = displacy.parse_ents( + doc, {"kb_url_template": "https://www.wikidata.org/wiki/{}"} + ) + assert isinstance(ents, dict) + assert ents["text"] == "But Google is starting from behind " + assert ents["ents"] == [ + { + "start": 4, + "end": 10, + "label": "ORG", + "kb_id": "Q95", + "kb_url": "https://www.wikidata.org/wiki/Q95", + } + ] def test_displacy_parse_deps(en_vocab): diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index 4361db4c0..be19f9c3a 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -313,11 +313,12 @@ If a setting is not present in the options, the default 
value will be used. > displacy.serve(doc, style="ent", options=options) > ``` -| Name | Description | -| --------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `ents` | Entity types to highlight or `None` for all types (default). ~~Optional[List[str]]~~ | -| `colors` | Color overrides. Entity types should be mapped to color names or values. ~~Dict[str, str]~~ | -| `template` 2.2 | Optional template to overwrite the HTML used to render entity spans. Should be a format string and can use `{bg}`, `{text}` and `{label}`. See [`templates.py`](%%GITHUB_SPACY/spacy/displacy/templates.py) for examples. ~~Optional[str]~~ | +| Name | Description | +| ------------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `ents` | Entity types to highlight or `None` for all types (default). ~~Optional[List[str]]~~ | +| `colors` | Color overrides. Entity types should be mapped to color names or values. ~~Dict[str, str]~~ | +| `template` 2.2 | Optional template to overwrite the HTML used to render entity spans. Should be a format string and can use `{bg}`, `{text}` and `{label}`. See [`templates.py`](%%GITHUB_SPACY/spacy/displacy/templates.py) for examples. ~~Optional[str]~~ | +| `kb_url_template` 3.2.1 | Optional template to construct the KB url for the entity to link to. Expects a python f-string format with single field to fill in. ~~Optional[str]~~ | By default, displaCy comes with colors for all entity types used by [spaCy's trained pipelines](/models). If you're using custom entity types, you @@ -326,6 +327,14 @@ or pipeline package can also expose a [`spacy_displacy_colors` entry point](/usage/saving-loading#entry-points-displacy) to add custom labels and their colors automatically. +By default, displaCy links to `#` for entities without a `kb_id` set on their +span. If you wish to link an entity to their URL then consider using the +`kb_url_template` option from above. For example if the `kb_id` on a span is +`Q95` and this is a Wikidata identifier then this option can be set to +`https://www.wikidata.org/wiki/{}`. Clicking on your entity in the rendered HTML +should redirect you to their Wikidata page, in this case +`https://www.wikidata.org/wiki/Q95`. + ## registry {#registry source="spacy/util.py" new="3"} spaCy's function registry extends @@ -412,10 +421,10 @@ finished. To log each training step, a and the accuracy scores on the development set. The built-in, default logger is the ConsoleLogger, which prints results to the -console in tabular format. The +console in tabular format. The [spacy-loggers](https://github.com/explosion/spacy-loggers) package, included as -a dependency of spaCy, enables other loggers: currently it provides one that sends -results to a [Weights & Biases](https://www.wandb.com/) dashboard. +a dependency of spaCy, enables other loggers: currently it provides one that +sends results to a [Weights & Biases](https://www.wandb.com/) dashboard. Instead of using one of the built-in loggers, you can [implement your own](/usage/training#custom-logging). @@ -466,7 +475,6 @@ start decreasing across epochs. 
- ## Readers {#readers} ### File readers {#file-readers source="github.com/explosion/srsly" new="3"} From c19f0c1604f7141a050292bf79d6eae3997b18c5 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 30 Nov 2021 10:08:51 +0100 Subject: [PATCH 090/133] Switch to latest CI images (#9773) --- azure-pipelines.yml | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 4291b6e0a..71a793911 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -23,7 +23,7 @@ jobs: # defined in .flake8 and overwrites the selected codes. - job: "Validate" pool: - vmImage: "ubuntu-18.04" + vmImage: "ubuntu-latest" steps: - task: UsePythonVersion@0 inputs: @@ -39,49 +39,49 @@ jobs: matrix: # We're only running one platform per Python version to speed up builds Python36Linux: - imageName: "ubuntu-18.04" + imageName: "ubuntu-latest" python.version: "3.6" # Python36Windows: - # imageName: "windows-2019" + # imageName: "windows-latest" # python.version: "3.6" # Python36Mac: - # imageName: "macos-10.14" + # imageName: "macos-latest" # python.version: "3.6" # Python37Linux: - # imageName: "ubuntu-18.04" + # imageName: "ubuntu-latest" # python.version: "3.7" Python37Windows: - imageName: "windows-2019" + imageName: "windows-latest" python.version: "3.7" # Python37Mac: - # imageName: "macos-10.14" + # imageName: "macos-latest" # python.version: "3.7" # Python38Linux: - # imageName: "ubuntu-18.04" + # imageName: "ubuntu-latest" # python.version: "3.8" # Python38Windows: - # imageName: "windows-2019" + # imageName: "windows-latest" # python.version: "3.8" Python38Mac: - imageName: "macos-10.14" + imageName: "macos-latest" python.version: "3.8" Python39Linux: - imageName: "ubuntu-18.04" + imageName: "ubuntu-latest" python.version: "3.9" # Python39Windows: - # imageName: "windows-2019" + # imageName: "windows-latest" # python.version: "3.9" # Python39Mac: - # imageName: "macos-10.14" + # imageName: "macos-latest" # python.version: "3.9" Python310Linux: - imageName: "ubuntu-20.04" + imageName: "ubuntu-latest" python.version: "3.10" Python310Windows: - imageName: "windows-2019" + imageName: "windows-latest" python.version: "3.10" Python310Mac: - imageName: "macos-10.15" + imageName: "macos-latest" python.version: "3.10" maxParallel: 4 pool: From 72f7f4e68a5076a87dd9402812bfb72e479237ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Tue, 30 Nov 2021 11:58:59 +0100 Subject: [PATCH 091/133] morphologizer: avoid recreating label tuple for each token (#9764) * morphologizer: avoid recreating label tuple for each token The `labels` property converts the dictionary key set to a tuple. This property was used for every annotated token, recreating the tuple over and over again. Construct the tuple once in the set_annotations function and reuse it. On a Finnish pipeline that I was experimenting with, this results in a speedup of ~15% (~13000 -> ~15000 WPS). 
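As an illustrative sketch of the pattern (simplified names, not the actual `Morphologizer` code), the change amounts to hoisting the property access out of the per-token loop:

```python
class SketchPipe:
    def __init__(self, label_dict):
        self._label_dict = label_dict

    @property
    def labels(self):
        # Rebuilds the tuple from the dict keys on every access.
        return tuple(self._label_dict.keys())

    def set_annotations_before(self, tag_ids):
        # Old behaviour: `self.labels` re-creates the tuple for every token.
        return [self.labels[i] for i in tag_ids]

    def set_annotations_after(self, tag_ids):
        # New behaviour: build the tuple once per batch and reuse it in the loop.
        labels = self.labels
        return [labels[i] for i in tag_ids]
```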
* tagger: avoid recreating label tuple for each token --- spacy/pipeline/morphologizer.pyx | 3 ++- spacy/pipeline/tagger.pyx | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index db425b69a..73d3799b1 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -231,12 +231,13 @@ class Morphologizer(Tagger): cdef Vocab vocab = self.vocab cdef bint overwrite = self.cfg["overwrite"] cdef bint extend = self.cfg["extend"] + labels = self.labels for i, doc in enumerate(docs): doc_tag_ids = batch_tag_ids[i] if hasattr(doc_tag_ids, "get"): doc_tag_ids = doc_tag_ids.get() for j, tag_id in enumerate(doc_tag_ids): - morph = self.labels[tag_id] + morph = labels[tag_id] # set morph if doc.c[j].morph == 0 or overwrite or extend: if overwrite and extend: diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx index a9cbac37a..c0768dfec 100644 --- a/spacy/pipeline/tagger.pyx +++ b/spacy/pipeline/tagger.pyx @@ -166,13 +166,14 @@ class Tagger(TrainablePipe): cdef Doc doc cdef Vocab vocab = self.vocab cdef bint overwrite = self.cfg["overwrite"] + labels = self.labels for i, doc in enumerate(docs): doc_tag_ids = batch_tag_ids[i] if hasattr(doc_tag_ids, "get"): doc_tag_ids = doc_tag_ids.get() for j, tag_id in enumerate(doc_tag_ids): if doc.c[j].tag == 0 or overwrite: - doc.c[j].tag = self.vocab.strings[self.labels[tag_id]] + doc.c[j].tag = self.vocab.strings[labels[tag_id]] def update(self, examples, *, drop=0., sgd=None, losses=None): """Learn from a batch of documents and gold-standard information, From 7d50804644eccb3dabd421ce413d1d2f748814a5 Mon Sep 17 00:00:00 2001 From: Lj Miranda <12949683+ljvmiranda921@users.noreply.github.com> Date: Sun, 5 Dec 2021 03:34:48 +0800 Subject: [PATCH 092/133] Migrate regression tests into the main test suite (#9655) * Migrate regressions 1-1000 * Move serialize test to correct file * Remove tests that won't work in v3 * Migrate regressions 1000-1500 Removed regression test 1250 because v3 doesn't support the old LEX scheme anymore. 
* Add missing imports in serializer tests * Migrate tests 1500-2000 * Migrate regressions from 2000-2500 * Migrate regressions from 2501-3000 * Migrate regressions from 3000-3501 * Migrate regressions from 3501-4000 * Migrate regressions from 4001-4500 * Migrate regressions from 4501-5000 * Migrate regressions from 5001-5501 * Migrate regressions from 5501 to 7000 * Migrate regressions from 7001 to 8000 * Migrate remaining regression tests * Fixing missing imports * Update docs with new system [ci skip] * Update CONTRIBUTING.md - Fix formatting - Update wording * Remove lemmatizer tests in el lang * Move a few tests into the general tokenizer * Separate Doc and DocBin tests --- CONTRIBUTING.md | 24 +- extra/DEVELOPER_DOCS/Code Conventions.md | 2 +- spacy/tests/doc/test_array.py | 23 + spacy/tests/doc/test_doc_api.py | 225 +++++++- spacy/tests/doc/test_retokenize_split.py | 42 ++ spacy/tests/doc/test_span.py | 102 ++++ spacy/tests/lang/en/test_sbd.py | 9 + spacy/tests/lang/en/test_tokenizer.py | 169 ++++++ spacy/tests/lang/es/test_text.py | 11 + spacy/tests/lang/hi/test_text.py | 11 + spacy/tests/lang/it/test_text.py | 14 + spacy/tests/lang/ja/test_tokenizer.py | 12 + spacy/tests/lang/sv/test_exceptions.py | 21 +- spacy/tests/lang/test_attrs.py | 13 +- spacy/tests/matcher/test_matcher_logic.py | 479 ++++++++++++++++- spacy/tests/matcher/test_phrase_matcher.py | 119 ++++- spacy/tests/parser/test_arc_eager_oracle.py | 22 + spacy/tests/parser/test_ner.py | 153 +++++- spacy/tests/parser/test_parse.py | 98 +++- spacy/tests/pipeline/test_entity_linker.py | 206 +++++++- spacy/tests/pipeline/test_entity_ruler.py | 117 ++++- spacy/tests/pipeline/test_pipe_factories.py | 33 ++ spacy/tests/pipeline/test_pipe_methods.py | 142 ++++- spacy/tests/pipeline/test_tagger.py | 17 + spacy/tests/pipeline/test_textcat.py | 237 ++++++++- spacy/tests/regression/__init__.py | 0 spacy/tests/regression/test_issue1-1000.py | 486 ----------------- spacy/tests/regression/test_issue1001-1500.py | 174 ------- spacy/tests/regression/test_issue1501-2000.py | 375 ------------- spacy/tests/regression/test_issue2001-2500.py | 152 ------ spacy/tests/regression/test_issue2501-3000.py | 238 --------- spacy/tests/regression/test_issue3001-3500.py | 272 ---------- spacy/tests/regression/test_issue3501-4000.py | 492 ------------------ spacy/tests/regression/test_issue4001-4500.py | 447 ---------------- spacy/tests/regression/test_issue4501-5000.py | 266 ---------- spacy/tests/regression/test_issue5001-5500.py | 149 ------ spacy/tests/regression/test_issue5501-6000.py | 95 ---- spacy/tests/regression/test_issue6001-6500.py | 30 -- spacy/tests/regression/test_issue6501-7000.py | 238 --------- spacy/tests/regression/test_issue7001-8000.py | 288 ---------- spacy/tests/regression/test_issue7716.py | 55 -- spacy/tests/regression/test_issue8168.py | 24 - spacy/tests/regression/test_issue8190.py | 24 - spacy/tests/regression/test_issue8216.py | 34 -- .../tests/serialize/test_serialize_config.py | 44 +- spacy/tests/serialize/test_serialize_doc.py | 229 +++++--- .../tests/serialize/test_serialize_docbin.py | 106 ++++ .../serialize/test_serialize_language.py | 73 ++- .../serialize/test_serialize_pipeline.py | 198 ++++++- .../serialize/test_serialize_tokenizer.py | 88 +++- .../serialize/test_serialize_vocab_strings.py | 60 ++- spacy/tests/test_cli.py | 119 ++++- spacy/tests/test_displacy.py | 90 ++++ spacy/tests/test_misc.py | 29 +- spacy/tests/tokenizer/test_tokenizer.py | 282 +++++++++- spacy/tests/training/test_training.py | 222 +++++++- 
spacy/tests/vocab_vectors/test_lexeme.py | 20 +- spacy/tests/vocab_vectors/test_similarity.py | 10 + spacy/tests/vocab_vectors/test_vectors.py | 86 ++- spacy/tests/vocab_vectors/test_vocab_api.py | 15 +- 60 files changed, 3789 insertions(+), 4022 deletions(-) create mode 100644 spacy/tests/lang/en/test_tokenizer.py create mode 100644 spacy/tests/lang/hi/test_text.py create mode 100644 spacy/tests/lang/it/test_text.py delete mode 100644 spacy/tests/regression/__init__.py delete mode 100644 spacy/tests/regression/test_issue1-1000.py delete mode 100644 spacy/tests/regression/test_issue1001-1500.py delete mode 100644 spacy/tests/regression/test_issue1501-2000.py delete mode 100644 spacy/tests/regression/test_issue2001-2500.py delete mode 100644 spacy/tests/regression/test_issue2501-3000.py delete mode 100644 spacy/tests/regression/test_issue3001-3500.py delete mode 100644 spacy/tests/regression/test_issue3501-4000.py delete mode 100644 spacy/tests/regression/test_issue4001-4500.py delete mode 100644 spacy/tests/regression/test_issue4501-5000.py delete mode 100644 spacy/tests/regression/test_issue5001-5500.py delete mode 100644 spacy/tests/regression/test_issue5501-6000.py delete mode 100644 spacy/tests/regression/test_issue6001-6500.py delete mode 100644 spacy/tests/regression/test_issue6501-7000.py delete mode 100644 spacy/tests/regression/test_issue7001-8000.py delete mode 100644 spacy/tests/regression/test_issue7716.py delete mode 100644 spacy/tests/regression/test_issue8168.py delete mode 100644 spacy/tests/regression/test_issue8190.py delete mode 100644 spacy/tests/regression/test_issue8216.py create mode 100644 spacy/tests/serialize/test_serialize_docbin.py diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index a4d321aa3..9a7d0744a 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -143,15 +143,25 @@ Changes to `.py` files will be effective immediately. ### Fixing bugs When fixing a bug, first create an -[issue](https://github.com/explosion/spaCy/issues) if one does not already exist. -The description text can be very short – we don't want to make this too +[issue](https://github.com/explosion/spaCy/issues) if one does not already +exist. The description text can be very short – we don't want to make this too bureaucratic. -Next, create a test file named `test_issue[ISSUE NUMBER].py` in the -[`spacy/tests/regression`](spacy/tests/regression) folder. Test for the bug -you're fixing, and make sure the test fails. Next, add and commit your test file -referencing the issue number in the commit message. Finally, fix the bug, make -sure your test passes and reference the issue in your commit message. +Next, add a test to the relevant file in the +[`spacy/tests`](spacy/tests)folder. Then add a [pytest +mark](https://docs.pytest.org/en/6.2.x/example/markers.html#working-with-custom-markers), +`@pytest.mark.issue(NUMBER)`, to reference the issue number. + +```python +# Assume you're fixing Issue #1234 +@pytest.mark.issue(1234) +def test_issue1234(): + ... +``` + +Test for the bug you're fixing, and make sure the test fails. Next, add and +commit your test file. Finally, fix the bug, make sure your test passes and +reference the issue number in your pull request description. 
📖 **For more information on how to add tests, check out the [tests README](spacy/tests/README.md).** diff --git a/extra/DEVELOPER_DOCS/Code Conventions.md b/extra/DEVELOPER_DOCS/Code Conventions.md index 7a3f6996f..eba466c46 100644 --- a/extra/DEVELOPER_DOCS/Code Conventions.md +++ b/extra/DEVELOPER_DOCS/Code Conventions.md @@ -444,7 +444,7 @@ spaCy uses the [`pytest`](http://doc.pytest.org/) framework for testing. Tests f When adding tests, make sure to use descriptive names and only test for one behavior at a time. Tests should be grouped into modules dedicated to the same type of functionality and some test modules are organized as directories of test files related to the same larger area of the library, e.g. `matcher` or `tokenizer`. -Regression tests are tests that refer to bugs reported in specific issues. They should live in the `regression` module and are named according to the issue number (e.g. `test_issue1234.py`). This system allows us to relate tests for specific bugs back to the original reported issue, which is especially useful if we introduce a regression and a previously passing regression tests suddenly fails again. When fixing a bug, it's often useful to create a regression test for it first. Every once in a while, we go through the `regression` module and group tests together into larger files by issue number, in groups of 500 to 1000 numbers. This prevents us from ending up with too many individual files over time. +Regression tests are tests that refer to bugs reported in specific issues. They should live in the relevant module of the test suite, named according to the issue number (e.g., `test_issue1234.py`), and [marked](https://docs.pytest.org/en/6.2.x/example/markers.html#working-with-custom-markers) appropriately (e.g. `@pytest.mark.issue(1234)`). This system allows us to relate tests for specific bugs back to the original reported issue, which is especially useful if we introduce a regression and a previously passing regression tests suddenly fails again. When fixing a bug, it's often useful to create a regression test for it first. The test suite also provides [fixtures](https://github.com/explosion/spaCy/blob/master/spacy/tests/conftest.py) for different language tokenizers that can be used as function arguments of the same name and will be passed in automatically. Those should only be used for tests related to those specific languages. We also have [test utility functions](https://github.com/explosion/spaCy/blob/master/spacy/tests/util.py) for common operations, like creating a temporary file. 
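As a small, hypothetical illustration of both conventions (the issue number and the assertion are invented for this sketch), a migrated regression test combines the issue marker with a tokenizer fixture requested by name:

```python
import pytest

@pytest.mark.issue(1234)  # hypothetical issue number, mirroring the example above
def test_en_tokenizer_splits_trailing_punct(en_tokenizer):
    # `en_tokenizer` is injected automatically from the session-scoped fixture
    # of the same name defined in spacy/tests/conftest.py.
    tokens = en_tokenizer("Hello world!")
    assert [t.text for t in tokens] == ["Hello", "world", "!"]
```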
diff --git a/spacy/tests/doc/test_array.py b/spacy/tests/doc/test_array.py index ef54c581c..c334cc6eb 100644 --- a/spacy/tests/doc/test_array.py +++ b/spacy/tests/doc/test_array.py @@ -1,8 +1,31 @@ +import numpy import pytest + from spacy.tokens import Doc from spacy.attrs import ORTH, SHAPE, POS, DEP, MORPH +@pytest.mark.issue(2203) +def test_issue2203(en_vocab): + """Test that lemmas are set correctly in doc.from_array.""" + words = ["I", "'ll", "survive"] + tags = ["PRP", "MD", "VB"] + lemmas = ["-PRON-", "will", "survive"] + tag_ids = [en_vocab.strings.add(tag) for tag in tags] + lemma_ids = [en_vocab.strings.add(lemma) for lemma in lemmas] + doc = Doc(en_vocab, words=words) + # Work around lemma corruption problem and set lemmas after tags + doc.from_array("TAG", numpy.array(tag_ids, dtype="uint64")) + doc.from_array("LEMMA", numpy.array(lemma_ids, dtype="uint64")) + assert [t.tag_ for t in doc] == tags + assert [t.lemma_ for t in doc] == lemmas + # We need to serialize both tag and lemma, since this is what causes the bug + doc_array = doc.to_array(["TAG", "LEMMA"]) + new_doc = Doc(doc.vocab, words=words).from_array(["TAG", "LEMMA"], doc_array) + assert [t.tag_ for t in new_doc] == tags + assert [t.lemma_ for t in new_doc] == lemmas + + def test_doc_array_attr_of_token(en_vocab): doc = Doc(en_vocab, words=["An", "example", "sentence"]) example = doc.vocab["example"] diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index 57df87642..c6195d7e2 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -1,14 +1,17 @@ import weakref -import pytest import numpy +import pytest +from thinc.api import NumpyOps, get_current_ops +from spacy.attrs import DEP, ENT_IOB, ENT_TYPE, HEAD, IS_ALPHA, MORPH, POS +from spacy.attrs import SENT_START, TAG +from spacy.lang.en import English from spacy.lang.xx import MultiLanguage +from spacy.language import Language +from spacy.lexeme import Lexeme from spacy.tokens import Doc, Span, Token from spacy.vocab import Vocab -from spacy.lexeme import Lexeme -from spacy.lang.en import English -from spacy.attrs import ENT_TYPE, ENT_IOB, SENT_START, HEAD, DEP, MORPH from .test_underscore import clean_underscore # noqa: F401 @@ -30,6 +33,220 @@ def test_doc_api_init(en_vocab): assert [t.is_sent_start for t in doc] == [True, False, True, False] +@pytest.mark.issue(1547) +def test_issue1547(): + """Test that entity labels still match after merging tokens.""" + words = ["\n", "worda", ".", "\n", "wordb", "-", "Biosphere", "2", "-", " \n"] + doc = Doc(Vocab(), words=words) + doc.ents = [Span(doc, 6, 8, label=doc.vocab.strings["PRODUCT"])] + with doc.retokenize() as retokenizer: + retokenizer.merge(doc[5:7]) + assert [ent.text for ent in doc.ents] + + +@pytest.mark.issue(1757) +def test_issue1757(): + """Test comparison against None doesn't cause segfault.""" + doc = Doc(Vocab(), words=["a", "b", "c"]) + assert not doc[0] < None + assert not doc[0] is None + assert doc[0] >= None + assert not doc[:2] < None + assert not doc[:2] is None + assert doc[:2] >= None + assert not doc.vocab["a"] is None + assert not doc.vocab["a"] < None + + +@pytest.mark.issue(2396) +def test_issue2396(en_vocab): + words = ["She", "created", "a", "test", "for", "spacy"] + heads = [1, 1, 3, 1, 3, 4] + deps = ["dep"] * len(heads) + matrix = numpy.array( + [ + [0, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1], + [1, 1, 2, 3, 3, 3], + [1, 1, 3, 3, 3, 3], + [1, 1, 3, 3, 4, 4], + [1, 1, 3, 3, 4, 5], + ], + dtype=numpy.int32, + ) + doc = Doc(en_vocab, 
words=words, heads=heads, deps=deps) + span = doc[:] + assert (doc.get_lca_matrix() == matrix).all() + assert (span.get_lca_matrix() == matrix).all() + + +@pytest.mark.parametrize("text", ["-0.23", "+123,456", "±1"]) +@pytest.mark.parametrize("lang_cls", [English, MultiLanguage]) +@pytest.mark.issue(2782) +def test_issue2782(text, lang_cls): + """Check that like_num handles + and - before number.""" + nlp = lang_cls() + doc = nlp(text) + assert len(doc) == 1 + assert doc[0].like_num + + +@pytest.mark.parametrize( + "sentence", + [ + "The story was to the effect that a young American student recently called on Professor Christlieb with a letter of introduction.", + "The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's #1.", + "The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's number one", + "Indeed, making the one who remains do all the work has installed him into a position of such insolent tyranny, it will take a month at least to reduce him to his proper proportions.", + "It was a missed assignment, but it shouldn't have resulted in a turnover ...", + ], +) +@pytest.mark.issue(3869) +def test_issue3869(sentence): + """Test that the Doc's count_by function works consistently""" + nlp = English() + doc = nlp(sentence) + count = 0 + for token in doc: + count += token.is_alpha + assert count == doc.count_by(IS_ALPHA).get(1, 0) + + +@pytest.mark.issue(3962) +def test_issue3962(en_vocab): + """Ensure that as_doc does not result in out-of-bound access of tokens. + This is achieved by setting the head to itself if it would lie out of the span otherwise.""" + # fmt: off + words = ["He", "jests", "at", "scars", ",", "that", "never", "felt", "a", "wound", "."] + heads = [1, 7, 1, 2, 7, 7, 7, 7, 9, 7, 7] + deps = ["nsubj", "ccomp", "prep", "pobj", "punct", "nsubj", "neg", "ROOT", "det", "dobj", "punct"] + # fmt: on + doc = Doc(en_vocab, words=words, heads=heads, deps=deps) + span2 = doc[1:5] # "jests at scars ," + doc2 = span2.as_doc() + doc2_json = doc2.to_json() + assert doc2_json + # head set to itself, being the new artificial root + assert doc2[0].head.text == "jests" + assert doc2[0].dep_ == "dep" + assert doc2[1].head.text == "jests" + assert doc2[1].dep_ == "prep" + assert doc2[2].head.text == "at" + assert doc2[2].dep_ == "pobj" + assert doc2[3].head.text == "jests" # head set to the new artificial root + assert doc2[3].dep_ == "dep" + # We should still have 1 sentence + assert len(list(doc2.sents)) == 1 + span3 = doc[6:9] # "never felt a" + doc3 = span3.as_doc() + doc3_json = doc3.to_json() + assert doc3_json + assert doc3[0].head.text == "felt" + assert doc3[0].dep_ == "neg" + assert doc3[1].head.text == "felt" + assert doc3[1].dep_ == "ROOT" + assert doc3[2].head.text == "felt" # head set to ancestor + assert doc3[2].dep_ == "dep" + # We should still have 1 sentence as "a" can be attached to "felt" instead of "wound" + assert len(list(doc3.sents)) == 1 + + +@pytest.mark.issue(3962) +def test_issue3962_long(en_vocab): + """Ensure that as_doc does not result in out-of-bound access of tokens. 
+ This is achieved by setting the head to itself if it would lie out of the span otherwise.""" + # fmt: off + words = ["He", "jests", "at", "scars", ".", "They", "never", "felt", "a", "wound", "."] + heads = [1, 1, 1, 2, 1, 7, 7, 7, 9, 7, 7] + deps = ["nsubj", "ROOT", "prep", "pobj", "punct", "nsubj", "neg", "ROOT", "det", "dobj", "punct"] + # fmt: on + two_sent_doc = Doc(en_vocab, words=words, heads=heads, deps=deps) + span2 = two_sent_doc[1:7] # "jests at scars. They never" + doc2 = span2.as_doc() + doc2_json = doc2.to_json() + assert doc2_json + # head set to itself, being the new artificial root (in sentence 1) + assert doc2[0].head.text == "jests" + assert doc2[0].dep_ == "ROOT" + assert doc2[1].head.text == "jests" + assert doc2[1].dep_ == "prep" + assert doc2[2].head.text == "at" + assert doc2[2].dep_ == "pobj" + assert doc2[3].head.text == "jests" + assert doc2[3].dep_ == "punct" + # head set to itself, being the new artificial root (in sentence 2) + assert doc2[4].head.text == "They" + assert doc2[4].dep_ == "dep" + # head set to the new artificial head (in sentence 2) + assert doc2[4].head.text == "They" + assert doc2[4].dep_ == "dep" + # We should still have 2 sentences + sents = list(doc2.sents) + assert len(sents) == 2 + assert sents[0].text == "jests at scars ." + assert sents[1].text == "They never" + + +@Language.factory("my_pipe") +class CustomPipe: + def __init__(self, nlp, name="my_pipe"): + self.name = name + Span.set_extension("my_ext", getter=self._get_my_ext) + Doc.set_extension("my_ext", default=None) + + def __call__(self, doc): + gathered_ext = [] + for sent in doc.sents: + sent_ext = self._get_my_ext(sent) + sent._.set("my_ext", sent_ext) + gathered_ext.append(sent_ext) + + doc._.set("my_ext", "\n".join(gathered_ext)) + return doc + + @staticmethod + def _get_my_ext(span): + return str(span.end) + + +@pytest.mark.issue(4903) +def test_issue4903(): + """Ensure that this runs correctly and doesn't hang or crash on Windows / + macOS.""" + nlp = English() + nlp.add_pipe("sentencizer") + nlp.add_pipe("my_pipe", after="sentencizer") + text = ["I like bananas.", "Do you like them?", "No, I prefer wasabi."] + if isinstance(get_current_ops(), NumpyOps): + docs = list(nlp.pipe(text, n_process=2)) + assert docs[0].text == "I like bananas." + assert docs[1].text == "Do you like them?" + assert docs[2].text == "No, I prefer wasabi." 
+ + +@pytest.mark.issue(5048) +def test_issue5048(en_vocab): + words = ["This", "is", "a", "sentence"] + pos_s = ["DET", "VERB", "DET", "NOUN"] + spaces = [" ", " ", " ", ""] + deps_s = ["dep", "adj", "nn", "atm"] + tags_s = ["DT", "VBZ", "DT", "NN"] + strings = en_vocab.strings + for w in words: + strings.add(w) + deps = [strings.add(d) for d in deps_s] + pos = [strings.add(p) for p in pos_s] + tags = [strings.add(t) for t in tags_s] + attrs = [POS, DEP, TAG] + array = numpy.array(list(zip(pos, deps, tags)), dtype="uint64") + doc = Doc(en_vocab, words=words, spaces=spaces) + doc.from_array(attrs, array) + v1 = [(token.text, token.pos_, token.tag_) for token in doc] + doc2 = Doc(en_vocab, words=words, pos=pos_s, deps=deps_s, tags=tags_s) + v2 = [(token.text, token.pos_, token.tag_) for token in doc2] + assert v1 == v2 + + @pytest.mark.parametrize("text", [["one", "two", "three"]]) def test_doc_api_compare_by_string_position(en_vocab, text): doc = Doc(en_vocab, words=text) diff --git a/spacy/tests/doc/test_retokenize_split.py b/spacy/tests/doc/test_retokenize_split.py index 16df1713d..ec4deb033 100644 --- a/spacy/tests/doc/test_retokenize_split.py +++ b/spacy/tests/doc/test_retokenize_split.py @@ -1,8 +1,50 @@ +import numpy import pytest + from spacy.vocab import Vocab from spacy.tokens import Doc, Token +@pytest.mark.issue(3540) +def test_issue3540(en_vocab): + words = ["I", "live", "in", "NewYork", "right", "now"] + tensor = numpy.asarray( + [[1.0, 1.1], [2.0, 2.1], [3.0, 3.1], [4.0, 4.1], [5.0, 5.1], [6.0, 6.1]], + dtype="f", + ) + doc = Doc(en_vocab, words=words) + doc.tensor = tensor + gold_text = ["I", "live", "in", "NewYork", "right", "now"] + assert [token.text for token in doc] == gold_text + gold_lemma = ["I", "live", "in", "NewYork", "right", "now"] + for i, lemma in enumerate(gold_lemma): + doc[i].lemma_ = lemma + assert [token.lemma_ for token in doc] == gold_lemma + vectors_1 = [token.vector for token in doc] + assert len(vectors_1) == len(doc) + + with doc.retokenize() as retokenizer: + heads = [(doc[3], 1), doc[2]] + attrs = { + "POS": ["PROPN", "PROPN"], + "LEMMA": ["New", "York"], + "DEP": ["pobj", "compound"], + } + retokenizer.split(doc[3], ["New", "York"], heads=heads, attrs=attrs) + + gold_text = ["I", "live", "in", "New", "York", "right", "now"] + assert [token.text for token in doc] == gold_text + gold_lemma = ["I", "live", "in", "New", "York", "right", "now"] + assert [token.lemma_ for token in doc] == gold_lemma + vectors_2 = [token.vector for token in doc] + assert len(vectors_2) == len(doc) + assert vectors_1[0].tolist() == vectors_2[0].tolist() + assert vectors_1[1].tolist() == vectors_2[1].tolist() + assert vectors_1[2].tolist() == vectors_2[2].tolist() + assert vectors_1[4].tolist() == vectors_2[5].tolist() + assert vectors_1[5].tolist() == vectors_2[6].tolist() + + def test_doc_retokenize_split(en_vocab): words = ["LosAngeles", "start", "."] heads = [1, 2, 2] diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py index 2503ad94c..d18293d3f 100644 --- a/spacy/tests/doc/test_span.py +++ b/spacy/tests/doc/test_span.py @@ -1,7 +1,9 @@ import pytest import numpy from numpy.testing import assert_array_equal + from spacy.attrs import ORTH, LENGTH +from spacy.lang.en import English from spacy.tokens import Doc, Span, Token from spacy.vocab import Vocab from spacy.util import filter_spans @@ -43,6 +45,106 @@ def doc_not_parsed(en_tokenizer): return doc +@pytest.mark.issue(1537) +def test_issue1537(): + """Test that Span.as_doc() doesn't segfault.""" 
+ string = "The sky is blue . The man is pink . The dog is purple ." + doc = Doc(Vocab(), words=string.split()) + doc[0].sent_start = True + for word in doc[1:]: + if word.nbor(-1).text == ".": + word.sent_start = True + else: + word.sent_start = False + sents = list(doc.sents) + sent0 = sents[0].as_doc() + sent1 = sents[1].as_doc() + assert isinstance(sent0, Doc) + assert isinstance(sent1, Doc) + + +@pytest.mark.issue(1612) +def test_issue1612(en_tokenizer): + """Test that span.orth_ is identical to span.text""" + doc = en_tokenizer("The black cat purrs.") + span = doc[1:3] + assert span.orth_ == span.text + + +@pytest.mark.issue(3199) +def test_issue3199(): + """Test that Span.noun_chunks works correctly if no noun chunks iterator + is available. To make this test future-proof, we're constructing a Doc + with a new Vocab here and a parse tree to make sure the noun chunks run. + """ + words = ["This", "is", "a", "sentence"] + doc = Doc(Vocab(), words=words, heads=[0] * len(words), deps=["dep"] * len(words)) + with pytest.raises(NotImplementedError): + list(doc[0:3].noun_chunks) + + +@pytest.mark.issue(5152) +def test_issue5152(): + # Test that the comparison between a Span and a Token, goes well + # There was a bug when the number of tokens in the span equaled the number of characters in the token (!) + nlp = English() + text = nlp("Talk about being boring!") + text_var = nlp("Talk of being boring!") + y = nlp("Let") + span = text[0:3] # Talk about being + span_2 = text[0:3] # Talk about being + span_3 = text_var[0:3] # Talk of being + token = y[0] # Let + with pytest.warns(UserWarning): + assert span.similarity(token) == 0.0 + assert span.similarity(span_2) == 1.0 + with pytest.warns(UserWarning): + assert span_2.similarity(span_3) < 1.0 + + +@pytest.mark.issue(6755) +def test_issue6755(en_tokenizer): + doc = en_tokenizer("This is a magnificent sentence.") + span = doc[:0] + assert span.text_with_ws == "" + assert span.text == "" + + +@pytest.mark.parametrize( + "sentence, start_idx,end_idx,label", + [("Welcome to Mumbai, my friend", 11, 17, "GPE")], +) +@pytest.mark.issue(6815) +def test_issue6815_1(sentence, start_idx, end_idx, label): + nlp = English() + doc = nlp(sentence) + span = doc[:].char_span(start_idx, end_idx, label=label) + assert span.label_ == label + + +@pytest.mark.parametrize( + "sentence, start_idx,end_idx,kb_id", [("Welcome to Mumbai, my friend", 11, 17, 5)] +) +@pytest.mark.issue(6815) +def test_issue6815_2(sentence, start_idx, end_idx, kb_id): + nlp = English() + doc = nlp(sentence) + span = doc[:].char_span(start_idx, end_idx, kb_id=kb_id) + assert span.kb_id == kb_id + + +@pytest.mark.parametrize( + "sentence, start_idx,end_idx,vector", + [("Welcome to Mumbai, my friend", 11, 17, numpy.array([0.1, 0.2, 0.3]))], +) +@pytest.mark.issue(6815) +def test_issue6815_3(sentence, start_idx, end_idx, vector): + nlp = English() + doc = nlp(sentence) + span = doc[:].char_span(start_idx, end_idx, vector=vector) + assert (span.vector == vector).all() + + @pytest.mark.parametrize( "i_sent,i,j,text", [ diff --git a/spacy/tests/lang/en/test_sbd.py b/spacy/tests/lang/en/test_sbd.py index 39d8d3b59..d30c72750 100644 --- a/spacy/tests/lang/en/test_sbd.py +++ b/spacy/tests/lang/en/test_sbd.py @@ -4,6 +4,15 @@ from spacy.tokens import Doc from ...util import apply_transition_sequence +@pytest.mark.issue(309) +def test_issue309(en_vocab): + """Test Issue #309: SBD fails on empty string""" + doc = Doc(en_vocab, words=[" "], heads=[0], deps=["ROOT"]) + assert len(doc) == 1 + sents = 
list(doc.sents) + assert len(sents) == 1 + + @pytest.mark.parametrize("words", [["A", "test", "sentence"]]) @pytest.mark.parametrize("punct", [".", "!", "?", ""]) def test_en_sbd_single_punct(en_vocab, words, punct): diff --git a/spacy/tests/lang/en/test_tokenizer.py b/spacy/tests/lang/en/test_tokenizer.py new file mode 100644 index 000000000..e6d1d7d85 --- /dev/null +++ b/spacy/tests/lang/en/test_tokenizer.py @@ -0,0 +1,169 @@ +import pytest + + +@pytest.mark.issue(351) +def test_issue351(en_tokenizer): + doc = en_tokenizer(" This is a cat.") + assert doc[0].idx == 0 + assert len(doc[0]) == 3 + assert doc[1].idx == 3 + + +@pytest.mark.issue(360) +def test_issue360(en_tokenizer): + """Test tokenization of big ellipsis""" + tokens = en_tokenizer("$45...............Asking") + assert len(tokens) > 2 + + +@pytest.mark.issue(736) +@pytest.mark.parametrize("text,number", [("7am", "7"), ("11p.m.", "11")]) +def test_issue736(en_tokenizer, text, number): + """Test that times like "7am" are tokenized correctly and that numbers are + converted to string.""" + tokens = en_tokenizer(text) + assert len(tokens) == 2 + assert tokens[0].text == number + + +@pytest.mark.issue(740) +@pytest.mark.parametrize("text", ["3/4/2012", "01/12/1900"]) +def test_issue740(en_tokenizer, text): + """Test that dates are not split and kept as one token. This behaviour is + currently inconsistent, since dates separated by hyphens are still split. + This will be hard to prevent without causing clashes with numeric ranges.""" + tokens = en_tokenizer(text) + assert len(tokens) == 1 + + +@pytest.mark.issue(744) +@pytest.mark.parametrize("text", ["We were scared", "We Were Scared"]) +def test_issue744(en_tokenizer, text): + """Test that 'were' and 'Were' are excluded from the contractions + generated by the English tokenizer exceptions.""" + tokens = en_tokenizer(text) + assert len(tokens) == 3 + assert tokens[1].text.lower() == "were" + + +@pytest.mark.issue(759) +@pytest.mark.parametrize( + "text,is_num", [("one", True), ("ten", True), ("teneleven", False)] +) +def test_issue759(en_tokenizer, text, is_num): + tokens = en_tokenizer(text) + assert tokens[0].like_num == is_num + + +@pytest.mark.issue(775) +@pytest.mark.parametrize("text", ["Shell", "shell", "Shed", "shed"]) +def test_issue775(en_tokenizer, text): + """Test that 'Shell' and 'shell' are excluded from the contractions + generated by the English tokenizer exceptions.""" + tokens = en_tokenizer(text) + assert len(tokens) == 1 + assert tokens[0].text == text + + +@pytest.mark.issue(792) +@pytest.mark.parametrize("text", ["This is a string ", "This is a string\u0020"]) +def test_issue792(en_tokenizer, text): + """Test for Issue #792: Trailing whitespace is removed after tokenization.""" + doc = en_tokenizer(text) + assert "".join([token.text_with_ws for token in doc]) == text + + +@pytest.mark.issue(792) +@pytest.mark.parametrize("text", ["This is a string", "This is a string\n"]) +def test_control_issue792(en_tokenizer, text): + """Test base case for Issue #792: Non-trailing whitespace""" + doc = en_tokenizer(text) + assert "".join([token.text_with_ws for token in doc]) == text + + +@pytest.mark.issue(859) +@pytest.mark.parametrize( + "text", ["aaabbb@ccc.com\nThank you!", "aaabbb@ccc.com \nThank you!"] +) +def test_issue859(en_tokenizer, text): + """Test that no extra space is added in doc.text method.""" + doc = en_tokenizer(text) + assert doc.text == text + + +@pytest.mark.issue(886) +@pytest.mark.parametrize("text", ["Datum:2014-06-02\nDokument:76467"]) +def 
test_issue886(en_tokenizer, text): + """Test that token.idx matches the original text index for texts with newlines.""" + doc = en_tokenizer(text) + for token in doc: + assert len(token.text) == len(token.text_with_ws) + assert text[token.idx] == token.text[0] + + +@pytest.mark.issue(891) +@pytest.mark.parametrize("text", ["want/need"]) +def test_issue891(en_tokenizer, text): + """Test that / infixes are split correctly.""" + tokens = en_tokenizer(text) + assert len(tokens) == 3 + assert tokens[1].text == "/" + + +@pytest.mark.issue(957) +@pytest.mark.slow +def test_issue957(en_tokenizer): + """Test that spaCy doesn't hang on many punctuation characters. + If this test hangs, check (new) regular expressions for conflicting greedy operators + """ + # Skip test if pytest-timeout is not installed + pytest.importorskip("pytest_timeout") + for punct in [".", ",", "'", '"', ":", "?", "!", ";", "-"]: + string = "0" + for i in range(1, 100): + string += punct + str(i) + doc = en_tokenizer(string) + assert doc + + +@pytest.mark.parametrize("text", ["test@example.com", "john.doe@example.co.uk"]) +@pytest.mark.issue(1698) +def test_issue1698(en_tokenizer, text): + """Test that doc doesn't identify email-addresses as URLs""" + doc = en_tokenizer(text) + assert len(doc) == 1 + assert not doc[0].like_url + + +@pytest.mark.issue(1758) +def test_issue1758(en_tokenizer): + """Test that "would've" is handled by the English tokenizer exceptions.""" + tokens = en_tokenizer("would've") + assert len(tokens) == 2 + + +@pytest.mark.issue(1773) +def test_issue1773(en_tokenizer): + """Test that spaces don't receive a POS but no TAG. This is the root cause + of the serialization issue reported in #1773.""" + doc = en_tokenizer("\n") + if doc[0].pos_ == "SPACE": + assert doc[0].tag_ != "" + + +@pytest.mark.issue(3277) +def test_issue3277(es_tokenizer): + """Test that hyphens are split correctly as prefixes.""" + doc = es_tokenizer("—Yo me llamo... –murmuró el niño– Emilio Sánchez Pérez.") + assert len(doc) == 14 + assert doc[0].text == "\u2014" + assert doc[5].text == "\u2013" + assert doc[9].text == "\u2013" + + +@pytest.mark.parametrize("word", ["don't", "don’t", "I'd", "I’d"]) +@pytest.mark.issue(3521) +def test_issue3521(en_tokenizer, word): + tok = en_tokenizer(word)[1] + # 'not' and 'would' should be stopwords, also in their abbreviated forms + assert tok.is_stop diff --git a/spacy/tests/lang/es/test_text.py b/spacy/tests/lang/es/test_text.py index 96f6bcab5..d95f6d26b 100644 --- a/spacy/tests/lang/es/test_text.py +++ b/spacy/tests/lang/es/test_text.py @@ -1,5 +1,16 @@ import pytest from spacy.lang.es.lex_attrs import like_num +from spacy.lang.es import Spanish + + +@pytest.mark.issue(3803) +def test_issue3803(): + """Test that spanish num-like tokens have True for like_num attribute.""" + nlp = Spanish() + text = "2 dos 1000 mil 12 doce" + doc = nlp(text) + + assert [t.like_num for t in doc] == [True, True, True, True, True, True] def test_es_tokenizer_handles_long_text(es_tokenizer): diff --git a/spacy/tests/lang/hi/test_text.py b/spacy/tests/lang/hi/test_text.py new file mode 100644 index 000000000..791cc3822 --- /dev/null +++ b/spacy/tests/lang/hi/test_text.py @@ -0,0 +1,11 @@ +import pytest +from spacy.lang.hi import Hindi + + +@pytest.mark.issue(3625) +def test_issue3625(): + """Test that default punctuation rules applies to hindi unicode characters""" + nlp = Hindi() + doc = nlp("hi. how हुए. 
होटल, होटल") + expected = ["hi", ".", "how", "हुए", ".", "होटल", ",", "होटल"] + assert [token.text for token in doc] == expected diff --git a/spacy/tests/lang/it/test_text.py b/spacy/tests/lang/it/test_text.py new file mode 100644 index 000000000..6023a20b1 --- /dev/null +++ b/spacy/tests/lang/it/test_text.py @@ -0,0 +1,14 @@ +import pytest + + +@pytest.mark.issue(2822) +def test_issue2822(it_tokenizer): + """Test that the abbreviation of poco is kept as one word.""" + doc = it_tokenizer("Vuoi un po' di zucchero?") + assert len(doc) == 6 + assert doc[0].text == "Vuoi" + assert doc[1].text == "un" + assert doc[2].text == "po'" + assert doc[3].text == "di" + assert doc[4].text == "zucchero" + assert doc[5].text == "?" diff --git a/spacy/tests/lang/ja/test_tokenizer.py b/spacy/tests/lang/ja/test_tokenizer.py index 3437ea283..ef7bed06d 100644 --- a/spacy/tests/lang/ja/test_tokenizer.py +++ b/spacy/tests/lang/ja/test_tokenizer.py @@ -54,6 +54,18 @@ SUB_TOKEN_TESTS = [ # fmt: on +@pytest.mark.issue(2901) +def test_issue2901(): + """Test that `nlp` doesn't fail.""" + try: + nlp = Japanese() + except ImportError: + pytest.skip() + + doc = nlp("pythonが大好きです") + assert doc + + @pytest.mark.parametrize("text,expected_tokens", TOKENIZER_TESTS) def test_ja_tokenizer(ja_tokenizer, text, expected_tokens): tokens = [token.text for token in ja_tokenizer(text)] diff --git a/spacy/tests/lang/sv/test_exceptions.py b/spacy/tests/lang/sv/test_exceptions.py index e6cae4d2b..b49a0c832 100644 --- a/spacy/tests/lang/sv/test_exceptions.py +++ b/spacy/tests/lang/sv/test_exceptions.py @@ -1,6 +1,5 @@ import pytest - SV_TOKEN_EXCEPTION_TESTS = [ ( "Smörsåsen används bl.a. till fisk", @@ -17,6 +16,26 @@ SV_TOKEN_EXCEPTION_TESTS = [ ] +@pytest.mark.issue(805) +@pytest.mark.parametrize( + "text,expected_tokens", + [ + ( + "Smörsåsen används bl.a. till fisk", + ["Smörsåsen", "används", "bl.a.", "till", "fisk"], + ), + ( + "Jag kommer först kl. 13 p.g.a. 
diverse förseningar", + ["Jag", "kommer", "först", "kl.", "13", "p.g.a.", "diverse", "förseningar"], + ), + ], +) +def test_issue805(sv_tokenizer, text, expected_tokens): + tokens = sv_tokenizer(text) + token_list = [token.text for token in tokens if not token.is_space] + assert expected_tokens == token_list + + @pytest.mark.parametrize("text,expected_tokens", SV_TOKEN_EXCEPTION_TESTS) def test_sv_tokenizer_handles_exception_cases(sv_tokenizer, text, expected_tokens): tokens = sv_tokenizer(text) diff --git a/spacy/tests/lang/test_attrs.py b/spacy/tests/lang/test_attrs.py index 6a7a404fd..5350c1fe5 100644 --- a/spacy/tests/lang/test_attrs.py +++ b/spacy/tests/lang/test_attrs.py @@ -1,6 +1,15 @@ import pytest -from spacy.attrs import intify_attrs, ORTH, NORM, LEMMA, IS_ALPHA -from spacy.lang.lex_attrs import is_punct, is_ascii, is_currency, like_url, word_shape + +from spacy.attrs import IS_ALPHA, LEMMA, NORM, ORTH, intify_attrs +from spacy.lang.en.stop_words import STOP_WORDS +from spacy.lang.lex_attrs import is_ascii, is_currency, is_punct, is_stop +from spacy.lang.lex_attrs import like_url, word_shape + + +@pytest.mark.parametrize("word", ["the"]) +@pytest.mark.issue(1889) +def test_issue1889(word): + assert is_stop(word, STOP_WORDS) == is_stop(word.upper(), STOP_WORDS) @pytest.mark.parametrize("text", ["dog"]) diff --git a/spacy/tests/matcher/test_matcher_logic.py b/spacy/tests/matcher/test_matcher_logic.py index b96bb2032..3649b07ed 100644 --- a/spacy/tests/matcher/test_matcher_logic.py +++ b/spacy/tests/matcher/test_matcher_logic.py @@ -1,10 +1,14 @@ -import pytest import re -from spacy.lang.en import English -from spacy.matcher import Matcher -from spacy.tokens import Doc, Span +import pytest +from spacy.attrs import IS_PUNCT, LOWER, ORTH +from spacy.errors import MatchPatternError +from spacy.lang.en import English +from spacy.lang.lex_attrs import LEX_ATTRS +from spacy.matcher import Matcher +from spacy.tokens import Doc, Span, Token +from spacy.vocab import Vocab pattern1 = [{"ORTH": "A"}, {"ORTH": "A", "OP": "*"}] pattern2 = [{"ORTH": "A", "OP": "*"}, {"ORTH": "A"}] @@ -36,6 +40,473 @@ def doc(en_tokenizer, text): return doc +@pytest.mark.issue(118) +@pytest.mark.parametrize( + "patterns", + [ + [[{"LOWER": "celtics"}], [{"LOWER": "boston"}, {"LOWER": "celtics"}]], + [[{"LOWER": "boston"}, {"LOWER": "celtics"}], [{"LOWER": "celtics"}]], + ], +) +def test_issue118(en_tokenizer, patterns): + """Test a bug that arose from having overlapping matches""" + text = ( + "how many points did lebron james score against the boston celtics last night" + ) + doc = en_tokenizer(text) + ORG = doc.vocab.strings["ORG"] + matcher = Matcher(doc.vocab) + matcher.add("BostonCeltics", patterns) + assert len(list(doc.ents)) == 0 + matches = [(ORG, start, end) for _, start, end in matcher(doc)] + assert matches == [(ORG, 9, 11), (ORG, 10, 11)] + doc.ents = matches[:1] + ents = list(doc.ents) + assert len(ents) == 1 + assert ents[0].label == ORG + assert ents[0].start == 9 + assert ents[0].end == 11 + + +@pytest.mark.issue(118) +@pytest.mark.parametrize( + "patterns", + [ + [[{"LOWER": "boston"}], [{"LOWER": "boston"}, {"LOWER": "celtics"}]], + [[{"LOWER": "boston"}, {"LOWER": "celtics"}], [{"LOWER": "boston"}]], + ], +) +def test_issue118_prefix_reorder(en_tokenizer, patterns): + """Test a bug that arose from having overlapping matches""" + text = ( + "how many points did lebron james score against the boston celtics last night" + ) + doc = en_tokenizer(text) + ORG = doc.vocab.strings["ORG"] + matcher = 
Matcher(doc.vocab) + matcher.add("BostonCeltics", patterns) + assert len(list(doc.ents)) == 0 + matches = [(ORG, start, end) for _, start, end in matcher(doc)] + doc.ents += tuple(matches)[1:] + assert matches == [(ORG, 9, 10), (ORG, 9, 11)] + ents = doc.ents + assert len(ents) == 1 + assert ents[0].label == ORG + assert ents[0].start == 9 + assert ents[0].end == 11 + + +@pytest.mark.issue(242) +def test_issue242(en_tokenizer): + """Test overlapping multi-word phrases.""" + text = "There are different food safety standards in different countries." + patterns = [ + [{"LOWER": "food"}, {"LOWER": "safety"}], + [{"LOWER": "safety"}, {"LOWER": "standards"}], + ] + doc = en_tokenizer(text) + matcher = Matcher(doc.vocab) + matcher.add("FOOD", patterns) + matches = [(ent_type, start, end) for ent_type, start, end in matcher(doc)] + match1, match2 = matches + assert match1[1] == 3 + assert match1[2] == 5 + assert match2[1] == 4 + assert match2[2] == 6 + with pytest.raises(ValueError): + # One token can only be part of one entity, so test that the matches + # can't be added as entities + doc.ents += tuple(matches) + + +@pytest.mark.issue(587) +def test_issue587(en_tokenizer): + """Test that Matcher doesn't segfault on particular input""" + doc = en_tokenizer("a b; c") + matcher = Matcher(doc.vocab) + matcher.add("TEST1", [[{ORTH: "a"}, {ORTH: "b"}]]) + matches = matcher(doc) + assert len(matches) == 1 + matcher.add("TEST2", [[{ORTH: "a"}, {ORTH: "b"}, {IS_PUNCT: True}, {ORTH: "c"}]]) + matches = matcher(doc) + assert len(matches) == 2 + matcher.add("TEST3", [[{ORTH: "a"}, {ORTH: "b"}, {IS_PUNCT: True}, {ORTH: "d"}]]) + matches = matcher(doc) + assert len(matches) == 2 + + +@pytest.mark.issue(588) +def test_issue588(en_vocab): + """Test if empty specs still cause an error when adding patterns""" + matcher = Matcher(en_vocab) + with pytest.raises(ValueError): + matcher.add("TEST", [[]]) + + +@pytest.mark.issue(590) +def test_issue590(en_vocab): + """Test overlapping matches""" + doc = Doc(en_vocab, words=["n", "=", "1", ";", "a", ":", "5", "%"]) + matcher = Matcher(en_vocab) + matcher.add( + "ab", [[{"IS_ALPHA": True}, {"ORTH": ":"}, {"LIKE_NUM": True}, {"ORTH": "%"}]] + ) + matcher.add("ab", [[{"IS_ALPHA": True}, {"ORTH": "="}, {"LIKE_NUM": True}]]) + matches = matcher(doc) + assert len(matches) == 2 + + +@pytest.mark.issue(615) +def test_issue615(en_tokenizer): + def merge_phrases(matcher, doc, i, matches): + """Merge a phrase. We have to be careful here because we'll change the + token indices. To avoid problems, merge all the phrases once we're called + on the last match.""" + if i != len(matches) - 1: + return None + spans = [Span(doc, start, end, label=label) for label, start, end in matches] + with doc.retokenize() as retokenizer: + for span in spans: + tag = "NNP" if span.label_ else span.root.tag_ + attrs = {"tag": tag, "lemma": span.text} + retokenizer.merge(span, attrs=attrs) + doc.ents = doc.ents + (span,) + + text = "The golf club is broken" + pattern = [{"ORTH": "golf"}, {"ORTH": "club"}] + label = "Sport_Equipment" + doc = en_tokenizer(text) + matcher = Matcher(doc.vocab) + matcher.add(label, [pattern], on_match=merge_phrases) + matcher(doc) + entities = list(doc.ents) + assert entities != [] + assert entities[0].label != 0 + + +@pytest.mark.issue(850) +def test_issue850(): + """The variable-length pattern matches the succeeding token. 
Check we + handle the ambiguity correctly.""" + vocab = Vocab(lex_attr_getters={LOWER: lambda string: string.lower()}) + matcher = Matcher(vocab) + pattern = [{"LOWER": "bob"}, {"OP": "*"}, {"LOWER": "frank"}] + matcher.add("FarAway", [pattern]) + doc = Doc(matcher.vocab, words=["bob", "and", "and", "frank"]) + match = matcher(doc) + assert len(match) == 1 + ent_id, start, end = match[0] + assert start == 0 + assert end == 4 + + +@pytest.mark.issue(850) +def test_issue850_basic(): + """Test Matcher matches with '*' operator and Boolean flag""" + vocab = Vocab(lex_attr_getters={LOWER: lambda string: string.lower()}) + matcher = Matcher(vocab) + pattern = [{"LOWER": "bob"}, {"OP": "*", "LOWER": "and"}, {"LOWER": "frank"}] + matcher.add("FarAway", [pattern]) + doc = Doc(matcher.vocab, words=["bob", "and", "and", "frank"]) + match = matcher(doc) + assert len(match) == 1 + ent_id, start, end = match[0] + assert start == 0 + assert end == 4 + + +@pytest.mark.issue(1434) +def test_issue1434(): + """Test matches occur when optional element at end of short doc.""" + pattern = [{"ORTH": "Hello"}, {"IS_ALPHA": True, "OP": "?"}] + vocab = Vocab(lex_attr_getters=LEX_ATTRS) + hello_world = Doc(vocab, words=["Hello", "World"]) + hello = Doc(vocab, words=["Hello"]) + matcher = Matcher(vocab) + matcher.add("MyMatcher", [pattern]) + matches = matcher(hello_world) + assert matches + matches = matcher(hello) + assert matches + + +@pytest.mark.parametrize( + "string,start,end", + [ + ("a", 0, 1), + ("a b", 0, 2), + ("a c", 0, 1), + ("a b c", 0, 2), + ("a b b c", 0, 3), + ("a b b", 0, 3), + ], +) +@pytest.mark.issue(1450) +def test_issue1450(string, start, end): + """Test matcher works when patterns end with * operator.""" + pattern = [{"ORTH": "a"}, {"ORTH": "b", "OP": "*"}] + matcher = Matcher(Vocab()) + matcher.add("TSTEND", [pattern]) + doc = Doc(Vocab(), words=string.split()) + matches = matcher(doc) + if start is None or end is None: + assert matches == [] + assert matches[-1][1] == start + assert matches[-1][2] == end + + +@pytest.mark.issue(1945) +def test_issue1945(): + """Test regression in Matcher introduced in v2.0.6.""" + matcher = Matcher(Vocab()) + matcher.add("MWE", [[{"orth": "a"}, {"orth": "a"}]]) + doc = Doc(matcher.vocab, words=["a", "a", "a"]) + matches = matcher(doc) # we should see two overlapping matches here + assert len(matches) == 2 + assert matches[0][1:] == (0, 2) + assert matches[1][1:] == (1, 3) + + +@pytest.mark.issue(1971) +def test_issue1971(en_vocab): + # Possibly related to #2675 and #2671? + matcher = Matcher(en_vocab) + pattern = [ + {"ORTH": "Doe"}, + {"ORTH": "!", "OP": "?"}, + {"_": {"optional": True}, "OP": "?"}, + {"ORTH": "!", "OP": "?"}, + ] + Token.set_extension("optional", default=False) + matcher.add("TEST", [pattern]) + doc = Doc(en_vocab, words=["Hello", "John", "Doe", "!"]) + # We could also assert length 1 here, but this is more conclusive, because + # the real problem here is that it returns a duplicate match for a match_id + # that's not actually in the vocab! 
+ matches = matcher(doc) + assert all([match_id in en_vocab.strings for match_id, start, end in matches]) + + +@pytest.mark.issue(1971) +def test_issue_1971_2(en_vocab): + matcher = Matcher(en_vocab) + pattern1 = [{"ORTH": "EUR", "LOWER": {"IN": ["eur"]}}, {"LIKE_NUM": True}] + pattern2 = [{"LIKE_NUM": True}, {"ORTH": "EUR"}] # {"IN": ["EUR"]}}] + doc = Doc(en_vocab, words=["EUR", "10", "is", "10", "EUR"]) + matcher.add("TEST1", [pattern1, pattern2]) + matches = matcher(doc) + assert len(matches) == 2 + + +@pytest.mark.issue(1971) +def test_issue_1971_3(en_vocab): + """Test that pattern matches correctly for multiple extension attributes.""" + Token.set_extension("a", default=1, force=True) + Token.set_extension("b", default=2, force=True) + doc = Doc(en_vocab, words=["hello", "world"]) + matcher = Matcher(en_vocab) + matcher.add("A", [[{"_": {"a": 1}}]]) + matcher.add("B", [[{"_": {"b": 2}}]]) + matches = sorted((en_vocab.strings[m_id], s, e) for m_id, s, e in matcher(doc)) + assert len(matches) == 4 + assert matches == sorted([("A", 0, 1), ("A", 1, 2), ("B", 0, 1), ("B", 1, 2)]) + + +@pytest.mark.issue(1971) +def test_issue_1971_4(en_vocab): + """Test that pattern matches correctly with multiple extension attribute + values on a single token. + """ + Token.set_extension("ext_a", default="str_a", force=True) + Token.set_extension("ext_b", default="str_b", force=True) + matcher = Matcher(en_vocab) + doc = Doc(en_vocab, words=["this", "is", "text"]) + pattern = [{"_": {"ext_a": "str_a", "ext_b": "str_b"}}] * 3 + matcher.add("TEST", [pattern]) + matches = matcher(doc) + # Uncommenting this caused a segmentation fault + assert len(matches) == 1 + assert matches[0] == (en_vocab.strings["TEST"], 0, 3) + + +@pytest.mark.issue(2464) +def test_issue2464(en_vocab): + """Test problem with successive ?. This is the same bug, so putting it here.""" + matcher = Matcher(en_vocab) + doc = Doc(en_vocab, words=["a", "b"]) + matcher.add("4", [[{"OP": "?"}, {"OP": "?"}]]) + matches = matcher(doc) + assert len(matches) == 3 + + +@pytest.mark.issue(2569) +def test_issue2569(en_tokenizer): + """Test that operator + is greedy.""" + doc = en_tokenizer("It is May 15, 1993.") + doc.ents = [Span(doc, 2, 6, label=doc.vocab.strings["DATE"])] + matcher = Matcher(doc.vocab) + matcher.add("RULE", [[{"ENT_TYPE": "DATE", "OP": "+"}]]) + matched = [doc[start:end] for _, start, end in matcher(doc)] + matched = sorted(matched, key=len, reverse=True) + assert len(matched) == 10 + assert len(matched[0]) == 4 + assert matched[0].text == "May 15, 1993" + + +@pytest.mark.issue(2671) +def test_issue2671(): + """Ensure the correct entity ID is returned for matches with quantifiers. 
+ See also #2675 + """ + nlp = English() + matcher = Matcher(nlp.vocab) + pattern_id = "test_pattern" + pattern = [ + {"LOWER": "high"}, + {"IS_PUNCT": True, "OP": "?"}, + {"LOWER": "adrenaline"}, + ] + matcher.add(pattern_id, [pattern]) + doc1 = nlp("This is a high-adrenaline situation.") + doc2 = nlp("This is a high adrenaline situation.") + matches1 = matcher(doc1) + for match_id, start, end in matches1: + assert nlp.vocab.strings[match_id] == pattern_id + matches2 = matcher(doc2) + for match_id, start, end in matches2: + assert nlp.vocab.strings[match_id] == pattern_id + + +@pytest.mark.issue(3009) +def test_issue3009(en_vocab): + """Test problem with matcher quantifiers""" + patterns = [ + [{"ORTH": "has"}, {"LOWER": "to"}, {"LOWER": "do"}, {"TAG": "IN"}], + [ + {"ORTH": "has"}, + {"IS_ASCII": True, "IS_PUNCT": False, "OP": "*"}, + {"LOWER": "to"}, + {"LOWER": "do"}, + {"TAG": "IN"}, + ], + [ + {"ORTH": "has"}, + {"IS_ASCII": True, "IS_PUNCT": False, "OP": "?"}, + {"LOWER": "to"}, + {"LOWER": "do"}, + {"TAG": "IN"}, + ], + ] + words = ["also", "has", "to", "do", "with"] + tags = ["RB", "VBZ", "TO", "VB", "IN"] + pos = ["ADV", "VERB", "ADP", "VERB", "ADP"] + doc = Doc(en_vocab, words=words, tags=tags, pos=pos) + matcher = Matcher(en_vocab) + for i, pattern in enumerate(patterns): + matcher.add(str(i), [pattern]) + matches = matcher(doc) + assert matches + + +@pytest.mark.issue(3328) +def test_issue3328(en_vocab): + doc = Doc(en_vocab, words=["Hello", ",", "how", "are", "you", "doing", "?"]) + matcher = Matcher(en_vocab) + patterns = [ + [{"LOWER": {"IN": ["hello", "how"]}}], + [{"LOWER": {"IN": ["you", "doing"]}}], + ] + matcher.add("TEST", patterns) + matches = matcher(doc) + assert len(matches) == 4 + matched_texts = [doc[start:end].text for _, start, end in matches] + assert matched_texts == ["Hello", "how", "you", "doing"] + + +@pytest.mark.issue(3549) +def test_issue3549(en_vocab): + """Test that match pattern validation doesn't raise on empty errors.""" + matcher = Matcher(en_vocab, validate=True) + pattern = [{"LOWER": "hello"}, {"LOWER": "world"}] + matcher.add("GOOD", [pattern]) + with pytest.raises(MatchPatternError): + matcher.add("BAD", [[{"X": "Y"}]]) + + +@pytest.mark.skip("Matching currently only works on strings and integers") +@pytest.mark.issue(3555) +def test_issue3555(en_vocab): + """Test that custom extensions with default None don't break matcher.""" + Token.set_extension("issue3555", default=None) + matcher = Matcher(en_vocab) + pattern = [{"ORTH": "have"}, {"_": {"issue3555": True}}] + matcher.add("TEST", [pattern]) + doc = Doc(en_vocab, words=["have", "apple"]) + matcher(doc) + + +@pytest.mark.issue(3839) +def test_issue3839(en_vocab): + """Test that match IDs returned by the matcher are correct, are in the string""" + doc = Doc(en_vocab, words=["terrific", "group", "of", "people"]) + matcher = Matcher(en_vocab) + match_id = "PATTERN" + pattern1 = [{"LOWER": "terrific"}, {"OP": "?"}, {"LOWER": "group"}] + pattern2 = [{"LOWER": "terrific"}, {"OP": "?"}, {"OP": "?"}, {"LOWER": "group"}] + matcher.add(match_id, [pattern1]) + matches = matcher(doc) + assert matches[0][0] == en_vocab.strings[match_id] + matcher = Matcher(en_vocab) + matcher.add(match_id, [pattern2]) + matches = matcher(doc) + assert matches[0][0] == en_vocab.strings[match_id] + + +@pytest.mark.issue(3879) +def test_issue3879(en_vocab): + doc = Doc(en_vocab, words=["This", "is", "a", "test", "."]) + assert len(doc) == 5 + pattern = [{"ORTH": "This", "OP": "?"}, {"OP": "?"}, {"ORTH": "test"}] + 
matcher = Matcher(en_vocab) + matcher.add("TEST", [pattern]) + assert len(matcher(doc)) == 2 # fails because of a FP match 'is a test' + + +@pytest.mark.issue(3951) +def test_issue3951(en_vocab): + """Test that combinations of optional rules are matched correctly.""" + matcher = Matcher(en_vocab) + pattern = [ + {"LOWER": "hello"}, + {"LOWER": "this", "OP": "?"}, + {"OP": "?"}, + {"LOWER": "world"}, + ] + matcher.add("TEST", [pattern]) + doc = Doc(en_vocab, words=["Hello", "my", "new", "world"]) + matches = matcher(doc) + assert len(matches) == 0 + + +@pytest.mark.issue(4120) +def test_issue4120(en_vocab): + """Test that matches without a final {OP: ?} token are returned.""" + matcher = Matcher(en_vocab) + matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}]]) + doc1 = Doc(en_vocab, words=["a"]) + assert len(matcher(doc1)) == 1 # works + doc2 = Doc(en_vocab, words=["a", "b", "c"]) + assert len(matcher(doc2)) == 2 # fixed + matcher = Matcher(en_vocab) + matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b"}]]) + doc3 = Doc(en_vocab, words=["a", "b", "b", "c"]) + assert len(matcher(doc3)) == 2 # works + matcher = Matcher(en_vocab) + matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b", "OP": "?"}]]) + doc4 = Doc(en_vocab, words=["a", "b", "b", "c"]) + assert len(matcher(doc4)) == 3 # fixed + + @pytest.mark.parametrize( "pattern,re_pattern", [ diff --git a/spacy/tests/matcher/test_phrase_matcher.py b/spacy/tests/matcher/test_phrase_matcher.py index 478949601..f893d81f8 100644 --- a/spacy/tests/matcher/test_phrase_matcher.py +++ b/spacy/tests/matcher/test_phrase_matcher.py @@ -1,8 +1,125 @@ import pytest import srsly from mock import Mock -from spacy.matcher import PhraseMatcher + +from spacy.lang.en import English +from spacy.matcher import PhraseMatcher, Matcher from spacy.tokens import Doc, Span +from spacy.vocab import Vocab + + +from ..util import make_tempdir + + +@pytest.mark.issue(3248) +def test_issue3248_1(): + """Test that the PhraseMatcher correctly reports its number of rules, not + total number of patterns.""" + nlp = English() + matcher = PhraseMatcher(nlp.vocab) + matcher.add("TEST1", [nlp("a"), nlp("b"), nlp("c")]) + matcher.add("TEST2", [nlp("d")]) + assert len(matcher) == 2 + + +@pytest.mark.issue(3331) +def test_issue3331(en_vocab): + """Test that duplicate patterns for different rules result in multiple + matches, one per rule. 
+ """ + matcher = PhraseMatcher(en_vocab) + matcher.add("A", [Doc(en_vocab, words=["Barack", "Obama"])]) + matcher.add("B", [Doc(en_vocab, words=["Barack", "Obama"])]) + doc = Doc(en_vocab, words=["Barack", "Obama", "lifts", "America"]) + matches = matcher(doc) + assert len(matches) == 2 + match_ids = [en_vocab.strings[matches[0][0]], en_vocab.strings[matches[1][0]]] + assert sorted(match_ids) == ["A", "B"] + + +@pytest.mark.issue(3972) +def test_issue3972(en_vocab): + """Test that the PhraseMatcher returns duplicates for duplicate match IDs.""" + matcher = PhraseMatcher(en_vocab) + matcher.add("A", [Doc(en_vocab, words=["New", "York"])]) + matcher.add("B", [Doc(en_vocab, words=["New", "York"])]) + doc = Doc(en_vocab, words=["I", "live", "in", "New", "York"]) + matches = matcher(doc) + + assert len(matches) == 2 + + # We should have a match for each of the two rules + found_ids = [en_vocab.strings[ent_id] for (ent_id, _, _) in matches] + assert "A" in found_ids + assert "B" in found_ids + + +@pytest.mark.issue(4002) +def test_issue4002(en_vocab): + """Test that the PhraseMatcher can match on overwritten NORM attributes.""" + matcher = PhraseMatcher(en_vocab, attr="NORM") + pattern1 = Doc(en_vocab, words=["c", "d"]) + assert [t.norm_ for t in pattern1] == ["c", "d"] + matcher.add("TEST", [pattern1]) + doc = Doc(en_vocab, words=["a", "b", "c", "d"]) + assert [t.norm_ for t in doc] == ["a", "b", "c", "d"] + matches = matcher(doc) + assert len(matches) == 1 + matcher = PhraseMatcher(en_vocab, attr="NORM") + pattern2 = Doc(en_vocab, words=["1", "2"]) + pattern2[0].norm_ = "c" + pattern2[1].norm_ = "d" + assert [t.norm_ for t in pattern2] == ["c", "d"] + matcher.add("TEST", [pattern2]) + matches = matcher(doc) + assert len(matches) == 1 + + +@pytest.mark.issue(4373) +def test_issue4373(): + """Test that PhraseMatcher.vocab can be accessed (like Matcher.vocab).""" + matcher = Matcher(Vocab()) + assert isinstance(matcher.vocab, Vocab) + matcher = PhraseMatcher(Vocab()) + assert isinstance(matcher.vocab, Vocab) + + +@pytest.mark.issue(4651) +def test_issue4651_with_phrase_matcher_attr(): + """Test that the EntityRuler PhraseMatcher is deserialized correctly using + the method from_disk when the EntityRuler argument phrase_matcher_attr is + specified. 
+ """ + text = "Spacy is a python library for nlp" + nlp = English() + patterns = [{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}] + ruler = nlp.add_pipe("entity_ruler", config={"phrase_matcher_attr": "LOWER"}) + ruler.add_patterns(patterns) + doc = nlp(text) + res = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents] + nlp_reloaded = English() + with make_tempdir() as d: + file_path = d / "entityruler" + ruler.to_disk(file_path) + nlp_reloaded.add_pipe("entity_ruler").from_disk(file_path) + doc_reloaded = nlp_reloaded(text) + res_reloaded = [(ent.text, ent.label_, ent.ent_id_) for ent in doc_reloaded.ents] + assert res == res_reloaded + + +@pytest.mark.issue(6839) +def test_issue6839(en_vocab): + """Ensure that PhraseMatcher accepts Span as input""" + # fmt: off + words = ["I", "like", "Spans", "and", "Docs", "in", "my", "input", ",", "and", "nothing", "else", "."] + # fmt: on + doc = Doc(en_vocab, words=words) + span = doc[:8] + pattern = Doc(en_vocab, words=["Spans", "and", "Docs"]) + matcher = PhraseMatcher(en_vocab) + matcher.add("SPACY", [pattern]) + matches = matcher(span) + assert matches def test_matcher_phrase_matcher(en_vocab): diff --git a/spacy/tests/parser/test_arc_eager_oracle.py b/spacy/tests/parser/test_arc_eager_oracle.py index cba6fa81e..bb226f9c5 100644 --- a/spacy/tests/parser/test_arc_eager_oracle.py +++ b/spacy/tests/parser/test_arc_eager_oracle.py @@ -40,6 +40,28 @@ def arc_eager(vocab): return moves +@pytest.mark.issue(7056) +def test_issue7056(): + """Test that the Unshift transition works properly, and doesn't cause + sentence segmentation errors.""" + vocab = Vocab() + ae = ArcEager( + vocab.strings, ArcEager.get_actions(left_labels=["amod"], right_labels=["pobj"]) + ) + doc = Doc(vocab, words="Severe pain , after trauma".split()) + state = ae.init_batch([doc])[0] + ae.apply_transition(state, "S") + ae.apply_transition(state, "L-amod") + ae.apply_transition(state, "S") + ae.apply_transition(state, "S") + ae.apply_transition(state, "S") + ae.apply_transition(state, "R-pobj") + ae.apply_transition(state, "D") + ae.apply_transition(state, "D") + ae.apply_transition(state, "D") + assert not state.eol() + + def test_oracle_four_words(arc_eager, vocab): words = ["a", "b", "c", "d"] heads = [1, 1, 3, 3] diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py index 21094bcb1..b3b29d1f9 100644 --- a/spacy/tests/parser/test_ner.py +++ b/spacy/tests/parser/test_ner.py @@ -1,13 +1,16 @@ +import random + import pytest from numpy.testing import assert_equal -from spacy.attrs import ENT_IOB +from spacy.attrs import ENT_IOB from spacy import util, registry from spacy.lang.en import English +from spacy.lang.it import Italian from spacy.language import Language from spacy.lookups import Lookups from spacy.pipeline._parser_internals.ner import BiluoPushDown -from spacy.training import Example +from spacy.training import Example, iob_to_biluo from spacy.tokens import Doc, Span from spacy.vocab import Vocab import logging @@ -58,6 +61,152 @@ def tsys(vocab, entity_types): return BiluoPushDown(vocab.strings, actions) +@pytest.mark.parametrize("label", ["U-JOB-NAME"]) +@pytest.mark.issue(1967) +def test_issue1967(label): + nlp = Language() + config = {} + ner = nlp.create_pipe("ner", config=config) + example = Example.from_dict( + Doc(ner.vocab, words=["word"]), + { + "ids": [0], + "words": ["word"], + "tags": ["tag"], + "heads": [0], + "deps": ["dep"], + "entities": [label], + }, + ) + assert "JOB-NAME" in 
ner.moves.get_actions(examples=[example])[1] + + +@pytest.mark.issue(2179) +def test_issue2179(): + """Test that spurious 'extra_labels' aren't created when initializing NER.""" + nlp = Italian() + ner = nlp.add_pipe("ner") + ner.add_label("CITIZENSHIP") + nlp.initialize() + nlp2 = Italian() + nlp2.add_pipe("ner") + assert len(nlp2.get_pipe("ner").labels) == 0 + model = nlp2.get_pipe("ner").model + model.attrs["resize_output"](model, nlp.get_pipe("ner").moves.n_moves) + nlp2.from_bytes(nlp.to_bytes()) + assert "extra_labels" not in nlp2.get_pipe("ner").cfg + assert nlp2.get_pipe("ner").labels == ("CITIZENSHIP",) + + +@pytest.mark.issue(2385) +def test_issue2385(): + """Test that IOB tags are correctly converted to BILUO tags.""" + # fix bug in labels with a 'b' character + tags1 = ("B-BRAWLER", "I-BRAWLER", "I-BRAWLER") + assert iob_to_biluo(tags1) == ["B-BRAWLER", "I-BRAWLER", "L-BRAWLER"] + # maintain support for iob1 format + tags2 = ("I-ORG", "I-ORG", "B-ORG") + assert iob_to_biluo(tags2) == ["B-ORG", "L-ORG", "U-ORG"] + # maintain support for iob2 format + tags3 = ("B-PERSON", "I-PERSON", "B-PERSON") + assert iob_to_biluo(tags3) == ["B-PERSON", "L-PERSON", "U-PERSON"] + + +@pytest.mark.issue(2800) +def test_issue2800(): + """Test issue that arises when too many labels are added to NER model. + Used to cause segfault. + """ + nlp = English() + train_data = [] + train_data.extend( + [Example.from_dict(nlp.make_doc("One sentence"), {"entities": []})] + ) + entity_types = [str(i) for i in range(1000)] + ner = nlp.add_pipe("ner") + for entity_type in list(entity_types): + ner.add_label(entity_type) + optimizer = nlp.initialize() + for i in range(20): + losses = {} + random.shuffle(train_data) + for example in train_data: + nlp.update([example], sgd=optimizer, losses=losses, drop=0.5) + + +@pytest.mark.issue(3209) +def test_issue3209(): + """Test issue that occurred in spaCy nightly where NER labels were being + mapped to classes incorrectly after loading the model, when the labels + were added using ner.add_label(). 
+ """ + nlp = English() + ner = nlp.add_pipe("ner") + ner.add_label("ANIMAL") + nlp.initialize() + move_names = ["O", "B-ANIMAL", "I-ANIMAL", "L-ANIMAL", "U-ANIMAL"] + assert ner.move_names == move_names + nlp2 = English() + ner2 = nlp2.add_pipe("ner") + model = ner2.model + model.attrs["resize_output"](model, ner.moves.n_moves) + nlp2.from_bytes(nlp.to_bytes()) + assert ner2.move_names == move_names + + +@pytest.mark.issue(4267) +def test_issue4267(): + """Test that running an entity_ruler after ner gives consistent results""" + nlp = English() + ner = nlp.add_pipe("ner") + ner.add_label("PEOPLE") + nlp.initialize() + assert "ner" in nlp.pipe_names + # assert that we have correct IOB annotations + doc1 = nlp("hi") + assert doc1.has_annotation("ENT_IOB") + for token in doc1: + assert token.ent_iob == 2 + # add entity ruler and run again + patterns = [{"label": "SOFTWARE", "pattern": "spacy"}] + ruler = nlp.add_pipe("entity_ruler") + ruler.add_patterns(patterns) + assert "entity_ruler" in nlp.pipe_names + assert "ner" in nlp.pipe_names + # assert that we still have correct IOB annotations + doc2 = nlp("hi") + assert doc2.has_annotation("ENT_IOB") + for token in doc2: + assert token.ent_iob == 2 + + +@pytest.mark.issue(4313) +def test_issue4313(): + """This should not crash or exit with some strange error code""" + beam_width = 16 + beam_density = 0.0001 + nlp = English() + config = { + "beam_width": beam_width, + "beam_density": beam_density, + } + ner = nlp.add_pipe("beam_ner", config=config) + ner.add_label("SOME_LABEL") + nlp.initialize() + # add a new label to the doc + doc = nlp("What do you think about Apple ?") + assert len(ner.labels) == 1 + assert "SOME_LABEL" in ner.labels + apple_ent = Span(doc, 5, 6, label="MY_ORG") + doc.ents = list(doc.ents) + [apple_ent] + + # ensure the beam_parse still works with the new label + docs = [doc] + ner.beam_parse(docs, drop=0.0, beam_width=beam_width, beam_density=beam_density) + assert len(ner.labels) == 2 + assert "MY_ORG" in ner.labels + + def test_get_oracle_moves(tsys, doc, entity_annots): example = Example.from_dict(doc, {"entities": entity_annots}) act_classes = tsys.get_oracle_sequence(example, _debug=False) diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py index b7575d063..7bbb30d8e 100644 --- a/spacy/tests/parser/test_parse.py +++ b/spacy/tests/parser/test_parse.py @@ -1,15 +1,17 @@ import pytest from numpy.testing import assert_equal -from spacy.attrs import DEP +from thinc.api import Adam +from spacy import registry, util +from spacy.attrs import DEP, NORM from spacy.lang.en import English -from spacy.training import Example from spacy.tokens import Doc -from spacy import util, registry +from spacy.training import Example +from spacy.vocab import Vocab -from ..util import apply_transition_sequence, make_tempdir from ...pipeline import DependencyParser from ...pipeline.dep_parser import DEFAULT_PARSER_MODEL +from ..util import apply_transition_sequence, make_tempdir TRAIN_DATA = [ ( @@ -59,6 +61,94 @@ PARTIAL_DATA = [ eps = 0.1 +@pytest.fixture +def vocab(): + return Vocab(lex_attr_getters={NORM: lambda s: s}) + + +@pytest.fixture +def parser(vocab): + vocab.strings.add("ROOT") + cfg = {"model": DEFAULT_PARSER_MODEL} + model = registry.resolve(cfg, validate=True)["model"] + parser = DependencyParser(vocab, model) + parser.cfg["token_vector_width"] = 4 + parser.cfg["hidden_width"] = 32 + # parser.add_label('right') + parser.add_label("left") + parser.initialize(lambda: [_parser_example(parser)]) + sgd = 
Adam(0.001) + + for i in range(10): + losses = {} + doc = Doc(vocab, words=["a", "b", "c", "d"]) + example = Example.from_dict( + doc, {"heads": [1, 1, 3, 3], "deps": ["left", "ROOT", "left", "ROOT"]} + ) + parser.update([example], sgd=sgd, losses=losses) + return parser + + +def _parser_example(parser): + doc = Doc(parser.vocab, words=["a", "b", "c", "d"]) + gold = {"heads": [1, 1, 3, 3], "deps": ["right", "ROOT", "left", "ROOT"]} + return Example.from_dict(doc, gold) + + +@pytest.mark.issue(2772) +def test_issue2772(en_vocab): + """Test that deprojectivization doesn't mess up sentence boundaries.""" + # fmt: off + words = ["When", "we", "write", "or", "communicate", "virtually", ",", "we", "can", "hide", "our", "true", "feelings", "."] + # fmt: on + # A tree with a non-projective (i.e. crossing) arc + # The arcs (0, 4) and (2, 9) cross. + heads = [4, 2, 9, 2, 2, 4, 9, 9, 9, 9, 12, 12, 9, 9] + deps = ["dep"] * len(heads) + doc = Doc(en_vocab, words=words, heads=heads, deps=deps) + assert doc[1].is_sent_start is False + + +@pytest.mark.issue(3830) +def test_issue3830_no_subtok(): + """Test that the parser doesn't have subtok label if not learn_tokens""" + config = { + "learn_tokens": False, + } + model = registry.resolve({"model": DEFAULT_PARSER_MODEL}, validate=True)["model"] + parser = DependencyParser(Vocab(), model, **config) + parser.add_label("nsubj") + assert "subtok" not in parser.labels + parser.initialize(lambda: [_parser_example(parser)]) + assert "subtok" not in parser.labels + + +@pytest.mark.issue(3830) +def test_issue3830_with_subtok(): + """Test that the parser does have subtok label if learn_tokens=True.""" + config = { + "learn_tokens": True, + } + model = registry.resolve({"model": DEFAULT_PARSER_MODEL}, validate=True)["model"] + parser = DependencyParser(Vocab(), model, **config) + parser.add_label("nsubj") + assert "subtok" not in parser.labels + parser.initialize(lambda: [_parser_example(parser)]) + assert "subtok" in parser.labels + + +@pytest.mark.issue(7716) +@pytest.mark.xfail(reason="Not fixed yet") +def test_partial_annotation(parser): + doc = Doc(parser.vocab, words=["a", "b", "c", "d"]) + doc[2].is_sent_start = False + # Note that if the following line is used, then doc[2].is_sent_start == False + # doc[3].is_sent_start = False + + doc = parser(doc) + assert doc[2].is_sent_start == False + + def test_parser_root(en_vocab): words = ["i", "do", "n't", "have", "other", "assistance"] heads = [3, 3, 3, 3, 5, 3] diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index a98d01964..3740e430e 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -1,18 +1,20 @@ from typing import Callable, Iterable + import pytest from numpy.testing import assert_equal + +from spacy import registry, util from spacy.attrs import ENT_KB_ID from spacy.compat import pickle -from spacy.kb import KnowledgeBase, get_candidates, Candidate -from spacy.vocab import Vocab - -from spacy import util, registry +from spacy.kb import Candidate, KnowledgeBase, get_candidates +from spacy.lang.en import English from spacy.ml import load_kb from spacy.scorer import Scorer -from spacy.training import Example -from spacy.lang.en import English from spacy.tests.util import make_tempdir from spacy.tokens import Span +from spacy.training import Example +from spacy.util import ensure_path +from spacy.vocab import Vocab @pytest.fixture @@ -25,6 +27,198 @@ def assert_almost_equal(a, b): assert a - delta <= b <= a 
+ delta +@pytest.mark.issue(4674) +def test_issue4674(): + """Test that setting entities with overlapping identifiers does not mess up IO""" + nlp = English() + kb = KnowledgeBase(nlp.vocab, entity_vector_length=3) + vector1 = [0.9, 1.1, 1.01] + vector2 = [1.8, 2.25, 2.01] + with pytest.warns(UserWarning): + kb.set_entities( + entity_list=["Q1", "Q1"], + freq_list=[32, 111], + vector_list=[vector1, vector2], + ) + assert kb.get_size_entities() == 1 + # dumping to file & loading back in + with make_tempdir() as d: + dir_path = ensure_path(d) + if not dir_path.exists(): + dir_path.mkdir() + file_path = dir_path / "kb" + kb.to_disk(str(file_path)) + kb2 = KnowledgeBase(nlp.vocab, entity_vector_length=3) + kb2.from_disk(str(file_path)) + assert kb2.get_size_entities() == 1 + + +@pytest.mark.issue(6730) +def test_issue6730(en_vocab): + """Ensure that the KB does not accept empty strings, but otherwise IO works fine.""" + from spacy.kb import KnowledgeBase + + kb = KnowledgeBase(en_vocab, entity_vector_length=3) + kb.add_entity(entity="1", freq=148, entity_vector=[1, 2, 3]) + + with pytest.raises(ValueError): + kb.add_alias(alias="", entities=["1"], probabilities=[0.4]) + assert kb.contains_alias("") is False + + kb.add_alias(alias="x", entities=["1"], probabilities=[0.2]) + kb.add_alias(alias="y", entities=["1"], probabilities=[0.1]) + + with make_tempdir() as tmp_dir: + kb.to_disk(tmp_dir) + kb.from_disk(tmp_dir) + assert kb.get_size_aliases() == 2 + assert set(kb.get_alias_strings()) == {"x", "y"} + + +@pytest.mark.issue(7065) +def test_issue7065(): + text = "Kathleen Battle sang in Mahler 's Symphony No. 8 at the Cincinnati Symphony Orchestra 's May Festival." + nlp = English() + nlp.add_pipe("sentencizer") + ruler = nlp.add_pipe("entity_ruler") + patterns = [ + { + "label": "THING", + "pattern": [ + {"LOWER": "symphony"}, + {"LOWER": "no"}, + {"LOWER": "."}, + {"LOWER": "8"}, + ], + } + ] + ruler.add_patterns(patterns) + + doc = nlp(text) + sentences = [s for s in doc.sents] + assert len(sentences) == 2 + sent0 = sentences[0] + ent = doc.ents[0] + assert ent.start < sent0.end < ent.end + assert sentences.index(ent.sent) == 0 + + +@pytest.mark.issue(7065) +def test_issue7065_b(): + # Test that the NEL doesn't crash when an entity crosses a sentence boundary + nlp = English() + vector_length = 3 + nlp.add_pipe("sentencizer") + text = "Mahler 's Symphony No. 8 was beautiful." + entities = [(0, 6, "PERSON"), (10, 24, "WORK")] + links = { + (0, 6): {"Q7304": 1.0, "Q270853": 0.0}, + (10, 24): {"Q7304": 0.0, "Q270853": 1.0}, + } + sent_starts = [1, -1, 0, 0, 0, 0, 0, 0, 0] + doc = nlp(text) + example = Example.from_dict( + doc, {"entities": entities, "links": links, "sent_starts": sent_starts} + ) + train_examples = [example] + + def create_kb(vocab): + # create artificial KB + mykb = KnowledgeBase(vocab, entity_vector_length=vector_length) + mykb.add_entity(entity="Q270853", freq=12, entity_vector=[9, 1, -7]) + mykb.add_alias( + alias="No. 
8",
+        entities=["Q270853"],
+        probabilities=[1.0],
+    )
+    mykb.add_entity(entity="Q7304", freq=12, entity_vector=[6, -4, 3])
+    mykb.add_alias(
+        alias="Mahler",
+        entities=["Q7304"],
+        probabilities=[1.0],
+    )
+    return mykb
+
+    # Create the Entity Linker component and add it to the pipeline
+    entity_linker = nlp.add_pipe("entity_linker", last=True)
+    entity_linker.set_kb(create_kb)
+    # train the NEL pipe
+    optimizer = nlp.initialize(get_examples=lambda: train_examples)
+    for i in range(2):
+        losses = {}
+        nlp.update(train_examples, sgd=optimizer, losses=losses)
+
+    # Add a custom rule-based component to mimic NER
+    patterns = [
+        {"label": "PERSON", "pattern": [{"LOWER": "mahler"}]},
+        {
+            "label": "WORK",
+            "pattern": [
+                {"LOWER": "symphony"},
+                {"LOWER": "no"},
+                {"LOWER": "."},
+                {"LOWER": "8"},
+            ],
+        },
+    ]
+    ruler = nlp.add_pipe("entity_ruler", before="entity_linker")
+    ruler.add_patterns(patterns)
+    # test the trained model - this should not throw E148
+    doc = nlp(text)
+    assert doc
+
+
+def test_partial_links():
+    # Test that having some entities on the doc without gold links doesn't crash
+    TRAIN_DATA = [
+        (
+            "Russ Cochran his reprints include EC Comics.",
+            {
+                "links": {(0, 12): {"Q2146908": 1.0}},
+                "entities": [(0, 12, "PERSON")],
+                "sent_starts": [1, -1, 0, 0, 0, 0, 0, 0],
+            },
+        )
+    ]
+    nlp = English()
+    vector_length = 3
+    train_examples = []
+    for text, annotation in TRAIN_DATA:
+        doc = nlp(text)
+        train_examples.append(Example.from_dict(doc, annotation))
+
+    def create_kb(vocab):
+        # create artificial KB
+        mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
+        mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
+        mykb.add_alias("Russ Cochran", ["Q2146908"], [0.9])
+        return mykb
+
+    # Create and train the Entity Linker
+    entity_linker = nlp.add_pipe("entity_linker", last=True)
+    entity_linker.set_kb(create_kb)
+    optimizer = nlp.initialize(get_examples=lambda: train_examples)
+    for i in range(2):
+        losses = {}
+        nlp.update(train_examples, sgd=optimizer, losses=losses)
+
+    # adding additional components that are required for the entity_linker
+    nlp.add_pipe("sentencizer", first=True)
+    patterns = [
+        {"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]},
+        {"label": "ORG", "pattern": [{"LOWER": "ec"}, {"LOWER": "comics"}]},
+    ]
+    ruler = nlp.add_pipe("entity_ruler", before="entity_linker")
+    ruler.add_patterns(patterns)
+
+    # this will run the pipeline on the examples and shouldn't crash
+    results = nlp.evaluate(train_examples)
+    assert "PERSON" in results["ents_per_type"]
+    assert "PERSON" in results["nel_f_per_type"]
+    assert "ORG" in results["ents_per_type"]
+    assert "ORG" not in results["nel_f_per_type"]
+
+
 def test_kb_valid_entities(nlp):
     """Test the valid construction of a KB with 3 entities and two aliases"""
     mykb = KnowledgeBase(nlp.vocab, entity_vector_length=3)
diff --git a/spacy/tests/pipeline/test_entity_ruler.py b/spacy/tests/pipeline/test_entity_ruler.py
index e66b49518..0cecafff3 100644
--- a/spacy/tests/pipeline/test_entity_ruler.py
+++ b/spacy/tests/pipeline/test_entity_ruler.py
@@ -1,9 +1,11 @@
 import pytest
 
 from spacy import registry
-from spacy.tokens import Span
+from spacy.tokens import Doc, Span
 from spacy.language import Language
-from spacy.pipeline import EntityRuler
+from spacy.lang.en import English
+from spacy.pipeline import EntityRuler, EntityRecognizer, merge_entities
+from spacy.pipeline.ner import DEFAULT_NER_MODEL
 from spacy.errors import MatchPatternError
 from spacy.tests.util import 
make_tempdir @@ -34,6 +36,117 @@ def add_ent_component(doc): return doc +@pytest.mark.issue(3345) +def test_issue3345(): + """Test case where preset entity crosses sentence boundary.""" + nlp = English() + doc = Doc(nlp.vocab, words=["I", "live", "in", "New", "York"]) + doc[4].is_sent_start = True + ruler = EntityRuler(nlp, patterns=[{"label": "GPE", "pattern": "New York"}]) + cfg = {"model": DEFAULT_NER_MODEL} + model = registry.resolve(cfg, validate=True)["model"] + ner = EntityRecognizer(doc.vocab, model) + # Add the OUT action. I wouldn't have thought this would be necessary... + ner.moves.add_action(5, "") + ner.add_label("GPE") + doc = ruler(doc) + # Get into the state just before "New" + state = ner.moves.init_batch([doc])[0] + ner.moves.apply_transition(state, "O") + ner.moves.apply_transition(state, "O") + ner.moves.apply_transition(state, "O") + # Check that B-GPE is valid. + assert ner.moves.is_valid(state, "B-GPE") + + +@pytest.mark.issue(4849) +def test_issue4849(): + nlp = English() + patterns = [ + {"label": "PERSON", "pattern": "joe biden", "id": "joe-biden"}, + {"label": "PERSON", "pattern": "bernie sanders", "id": "bernie-sanders"}, + ] + ruler = nlp.add_pipe("entity_ruler", config={"phrase_matcher_attr": "LOWER"}) + ruler.add_patterns(patterns) + text = """ + The left is starting to take aim at Democratic front-runner Joe Biden. + Sen. Bernie Sanders joined in her criticism: "There is no 'middle ground' when it comes to climate policy." + """ + # USING 1 PROCESS + count_ents = 0 + for doc in nlp.pipe([text], n_process=1): + count_ents += len([ent for ent in doc.ents if ent.ent_id > 0]) + assert count_ents == 2 + # USING 2 PROCESSES + if isinstance(get_current_ops, NumpyOps): + count_ents = 0 + for doc in nlp.pipe([text], n_process=2): + count_ents += len([ent for ent in doc.ents if ent.ent_id > 0]) + assert count_ents == 2 + + +@pytest.mark.issue(5918) +def test_issue5918(): + # Test edge case when merging entities. + nlp = English() + ruler = nlp.add_pipe("entity_ruler") + patterns = [ + {"label": "ORG", "pattern": "Digicon Inc"}, + {"label": "ORG", "pattern": "Rotan Mosle Inc's"}, + {"label": "ORG", "pattern": "Rotan Mosle Technology Partners Ltd"}, + ] + ruler.add_patterns(patterns) + + text = """ + Digicon Inc said it has completed the previously-announced disposition + of its computer systems division to an investment group led by + Rotan Mosle Inc's Rotan Mosle Technology Partners Ltd affiliate. + """ + doc = nlp(text) + assert len(doc.ents) == 3 + # make it so that the third span's head is within the entity (ent_iob=I) + # bug #5918 would wrongly transfer that I to the full entity, resulting in 2 instead of 3 final ents. 
+ # TODO: test for logging here + # with pytest.warns(UserWarning): + # doc[29].head = doc[33] + doc = merge_entities(doc) + assert len(doc.ents) == 3 + + +@pytest.mark.issue(8168) +def test_issue8168(): + nlp = English() + ruler = nlp.add_pipe("entity_ruler") + patterns = [ + {"label": "ORG", "pattern": "Apple"}, + { + "label": "GPE", + "pattern": [{"LOWER": "san"}, {"LOWER": "francisco"}], + "id": "san-francisco", + }, + { + "label": "GPE", + "pattern": [{"LOWER": "san"}, {"LOWER": "fran"}], + "id": "san-francisco", + }, + ] + ruler.add_patterns(patterns) + + assert ruler._ent_ids == {8043148519967183733: ("GPE", "san-francisco")} + + +@pytest.mark.issue(8216) +def test_entity_ruler_fix8216(nlp, patterns): + """Test that patterns don't get added excessively.""" + ruler = nlp.add_pipe("entity_ruler", config={"validate": True}) + ruler.add_patterns(patterns) + pattern_count = sum(len(mm) for mm in ruler.matcher._patterns.values()) + assert pattern_count > 0 + ruler.add_patterns([]) + after_count = sum(len(mm) for mm in ruler.matcher._patterns.values()) + assert after_count == pattern_count + + def test_entity_ruler_init(nlp, patterns): ruler = EntityRuler(nlp, patterns=patterns) assert len(ruler) == len(patterns) diff --git a/spacy/tests/pipeline/test_pipe_factories.py b/spacy/tests/pipeline/test_pipe_factories.py index 0c2554727..4128e2a48 100644 --- a/spacy/tests/pipeline/test_pipe_factories.py +++ b/spacy/tests/pipeline/test_pipe_factories.py @@ -1,4 +1,6 @@ import pytest + +import spacy from spacy.language import Language from spacy.lang.en import English from spacy.lang.de import German @@ -11,6 +13,37 @@ from pydantic import StrictInt, StrictStr from ..util import make_tempdir +@pytest.mark.issue(5137) +def test_issue5137(): + factory_name = "test_issue5137" + pipe_name = "my_component" + + @Language.factory(factory_name) + class MyComponent: + def __init__(self, nlp, name=pipe_name, categories="all_categories"): + self.nlp = nlp + self.categories = categories + self.name = name + + def __call__(self, doc): + pass + + def to_disk(self, path, **kwargs): + pass + + def from_disk(self, path, **cfg): + pass + + nlp = English() + my_component = nlp.add_pipe(factory_name, name=pipe_name) + assert my_component.categories == "all_categories" + with make_tempdir() as tmpdir: + nlp.to_disk(tmpdir) + overrides = {"components": {pipe_name: {"categories": "my_categories"}}} + nlp2 = spacy.load(tmpdir, config=overrides) + assert nlp2.get_pipe(pipe_name).categories == "my_categories" + + def test_pipe_function_component(): name = "test_component" diff --git a/spacy/tests/pipeline/test_pipe_methods.py b/spacy/tests/pipeline/test_pipe_methods.py index 87fd64307..4b8fb8ebc 100644 --- a/spacy/tests/pipeline/test_pipe_methods.py +++ b/spacy/tests/pipeline/test_pipe_methods.py @@ -1,9 +1,17 @@ +import gc + +import numpy import pytest +from thinc.api import get_current_ops + +from spacy.lang.en import English +from spacy.lang.en.syntax_iterators import noun_chunks from spacy.language import Language from spacy.pipeline import TrainablePipe +from spacy.tokens import Doc from spacy.training import Example from spacy.util import SimpleFrozenList, get_arg_names -from spacy.lang.en import English +from spacy.vocab import Vocab @pytest.fixture @@ -21,6 +29,138 @@ def other_pipe(doc): return doc +@pytest.mark.issue(1506) +def test_issue1506(): + def string_generator(): + for _ in range(10001): + yield "It's sentence produced by that bug." + for _ in range(10001): + yield "I erase some hbdsaj lemmas." 
+        for _ in range(10001):
+            yield "I erase lemmas."
+        for _ in range(10001):
+            yield "It's sentence produced by that bug."
+        for _ in range(10001):
+            yield "It's sentence produced by that bug."
+
+    nlp = English()
+    for i, d in enumerate(nlp.pipe(string_generator())):
+        # We should run cleanup more than once to actually clean up the data.
+        # In the first run, cleanup only marks strings as «not hit».
+        if i == 10000 or i == 20000 or i == 30000:
+            gc.collect()
+        for t in d:
+            str(t.lemma_)
+
+
+@pytest.mark.issue(1654)
+def test_issue1654():
+    nlp = Language(Vocab())
+    assert not nlp.pipeline
+
+    @Language.component("component")
+    def component(doc):
+        return doc
+
+    nlp.add_pipe("component", name="1")
+    nlp.add_pipe("component", name="2", after="1")
+    nlp.add_pipe("component", name="3", after="2")
+    assert nlp.pipe_names == ["1", "2", "3"]
+    nlp2 = Language(Vocab())
+    assert not nlp2.pipeline
+    nlp2.add_pipe("component", name="3")
+    nlp2.add_pipe("component", name="2", before="3")
+    nlp2.add_pipe("component", name="1", before="2")
+    assert nlp2.pipe_names == ["1", "2", "3"]
+
+
+@pytest.mark.issue(3880)
+def test_issue3880():
+    """Test that `nlp.pipe()` works when an empty string ends the batch.
+
+    Fixed in v7.0.5 of Thinc.
+    """
+    texts = ["hello", "world", "", ""]
+    nlp = English()
+    nlp.add_pipe("parser").add_label("dep")
+    nlp.add_pipe("ner").add_label("PERSON")
+    nlp.add_pipe("tagger").add_label("NN")
+    nlp.initialize()
+    for doc in nlp.pipe(texts):
+        pass
+
+
+@pytest.mark.issue(5082)
+def test_issue5082():
+    # Ensure the 'merge_entities' pipeline does something sensible for the vectors of the merged tokens
+    nlp = English()
+    vocab = nlp.vocab
+    array1 = numpy.asarray([0.1, 0.5, 0.8], dtype=numpy.float32)
+    array2 = numpy.asarray([-0.2, -0.6, -0.9], dtype=numpy.float32)
+    array3 = numpy.asarray([0.3, -0.1, 0.7], dtype=numpy.float32)
+    array4 = numpy.asarray([0.5, 0, 0.3], dtype=numpy.float32)
+    array34 = numpy.asarray([0.4, -0.05, 0.5], dtype=numpy.float32)
+    vocab.set_vector("I", array1)
+    vocab.set_vector("like", array2)
+    vocab.set_vector("David", array3)
+    vocab.set_vector("Bowie", array4)
+    text = "I like David Bowie"
+    patterns = [
+        {"label": "PERSON", "pattern": [{"LOWER": "david"}, {"LOWER": "bowie"}]}
+    ]
+    ruler = nlp.add_pipe("entity_ruler")
+    ruler.add_patterns(patterns)
+    parsed_vectors_1 = [t.vector for t in nlp(text)]
+    assert len(parsed_vectors_1) == 4
+    ops = get_current_ops()
+    numpy.testing.assert_array_equal(ops.to_numpy(parsed_vectors_1[0]), array1)
+    numpy.testing.assert_array_equal(ops.to_numpy(parsed_vectors_1[1]), array2)
+    numpy.testing.assert_array_equal(ops.to_numpy(parsed_vectors_1[2]), array3)
+    numpy.testing.assert_array_equal(ops.to_numpy(parsed_vectors_1[3]), array4)
+    nlp.add_pipe("merge_entities")
+    parsed_vectors_2 = [t.vector for t in nlp(text)]
+    assert len(parsed_vectors_2) == 3
+    numpy.testing.assert_array_equal(ops.to_numpy(parsed_vectors_2[0]), array1)
+    numpy.testing.assert_array_equal(ops.to_numpy(parsed_vectors_2[1]), array2)
+    numpy.testing.assert_array_equal(ops.to_numpy(parsed_vectors_2[2]), array34)
+
+
+@pytest.mark.issue(5458)
+def test_issue5458():
+    # Test that the noun chunker does not generate overlapping spans
+    # fmt: off
+    words = ["In", "an", "era", "where", "markets", "have", "brought", "prosperity", "and", "empowerment", "."]
+    vocab = Vocab(strings=words)
+    deps = ["ROOT", "det", "pobj", "advmod", "nsubj", "aux", "relcl", "dobj", "cc", "conj", "punct"]
+    pos = ["ADP", "DET", "NOUN", "ADV", "NOUN", "AUX", "VERB", "NOUN", 
"CCONJ", "NOUN", "PUNCT"] + heads = [0, 2, 0, 9, 6, 6, 2, 6, 7, 7, 0] + # fmt: on + en_doc = Doc(vocab, words=words, pos=pos, heads=heads, deps=deps) + en_doc.noun_chunks_iterator = noun_chunks + + # if there are overlapping spans, this will fail with an E102 error "Can't merge non-disjoint spans" + nlp = English() + merge_nps = nlp.create_pipe("merge_noun_chunks") + merge_nps(en_doc) + + +def test_multiple_predictions(): + class DummyPipe(TrainablePipe): + def __init__(self): + self.model = "dummy_model" + + def predict(self, docs): + return ([1, 2, 3], [4, 5, 6]) + + def set_annotations(self, docs, scores): + return docs + + nlp = Language() + doc = nlp.make_doc("foo") + dummy_pipe = DummyPipe() + dummy_pipe(doc) + + def test_add_pipe_no_name(nlp): nlp.add_pipe("new_pipe") assert "new_pipe" in nlp.pipe_names diff --git a/spacy/tests/pipeline/test_tagger.py b/spacy/tests/pipeline/test_tagger.py index ec14b70da..96e75851e 100644 --- a/spacy/tests/pipeline/test_tagger.py +++ b/spacy/tests/pipeline/test_tagger.py @@ -6,10 +6,27 @@ from spacy import util from spacy.training import Example from spacy.lang.en import English from spacy.language import Language +from thinc.api import compounding from ..util import make_tempdir +@pytest.mark.issue(4348) +def test_issue4348(): + """Test that training the tagger with empty data, doesn't throw errors""" + nlp = English() + example = Example.from_dict(nlp.make_doc(""), {"tags": []}) + TRAIN_DATA = [example, example] + tagger = nlp.add_pipe("tagger") + tagger.add_label("A") + optimizer = nlp.initialize() + for i in range(5): + losses = {} + batches = util.minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001)) + for batch in batches: + nlp.update(batch, sgd=optimizer, losses=losses) + + def test_label_types(): nlp = Language() tagger = nlp.add_pipe("tagger") diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py index b134b8508..282789f2b 100644 --- a/spacy/tests/pipeline/test_textcat.py +++ b/spacy/tests/pipeline/test_textcat.py @@ -1,20 +1,31 @@ -import pytest import random + import numpy.random +import pytest from numpy.testing import assert_almost_equal -from thinc.api import fix_random_seed +from thinc.api import Config, compounding, fix_random_seed, get_current_ops +from wasabi import msg + +import spacy from spacy import util +from spacy.cli.evaluate import print_prf_per_type, print_textcats_auc_per_cat from spacy.lang.en import English from spacy.language import Language from spacy.pipeline import TextCategorizer -from spacy.tokens import Doc +from spacy.pipeline.textcat import single_label_bow_config +from spacy.pipeline.textcat import single_label_cnn_config +from spacy.pipeline.textcat import single_label_default_config +from spacy.pipeline.textcat_multilabel import multi_label_bow_config +from spacy.pipeline.textcat_multilabel import multi_label_cnn_config +from spacy.pipeline.textcat_multilabel import multi_label_default_config from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL from spacy.scorer import Scorer +from spacy.tokens import Doc, DocBin from spacy.training import Example +from spacy.training.initialize import init_nlp from ..util import make_tempdir - TRAIN_DATA_SINGLE_LABEL = [ ("I'm so happy.", {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}), ("I'm so angry", {"cats": {"POSITIVE": 0.0, "NEGATIVE": 1.0}}), @@ -48,6 +59,224 @@ def make_get_examples_multi_label(nlp): return get_examples +@pytest.mark.issue(3611) +def test_issue3611(): + """Test whether adding n-grams in the textcat works 
even when n > token length of some docs""" + unique_classes = ["offensive", "inoffensive"] + x_train = [ + "This is an offensive text", + "This is the second offensive text", + "inoff", + ] + y_train = ["offensive", "offensive", "inoffensive"] + nlp = spacy.blank("en") + # preparing the data + train_data = [] + for text, train_instance in zip(x_train, y_train): + cat_dict = {label: label == train_instance for label in unique_classes} + train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": cat_dict})) + # add a text categorizer component + model = { + "@architectures": "spacy.TextCatBOW.v1", + "exclusive_classes": True, + "ngram_size": 2, + "no_output_layer": False, + } + textcat = nlp.add_pipe("textcat", config={"model": model}, last=True) + for label in unique_classes: + textcat.add_label(label) + # training the network + with nlp.select_pipes(enable="textcat"): + optimizer = nlp.initialize() + for i in range(3): + losses = {} + batches = util.minibatch(train_data, size=compounding(4.0, 32.0, 1.001)) + + for batch in batches: + nlp.update(examples=batch, sgd=optimizer, drop=0.1, losses=losses) + + +@pytest.mark.issue(4030) +def test_issue4030(): + """Test whether textcat works fine with empty doc""" + unique_classes = ["offensive", "inoffensive"] + x_train = [ + "This is an offensive text", + "This is the second offensive text", + "inoff", + ] + y_train = ["offensive", "offensive", "inoffensive"] + nlp = spacy.blank("en") + # preparing the data + train_data = [] + for text, train_instance in zip(x_train, y_train): + cat_dict = {label: label == train_instance for label in unique_classes} + train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": cat_dict})) + # add a text categorizer component + model = { + "@architectures": "spacy.TextCatBOW.v1", + "exclusive_classes": True, + "ngram_size": 2, + "no_output_layer": False, + } + textcat = nlp.add_pipe("textcat", config={"model": model}, last=True) + for label in unique_classes: + textcat.add_label(label) + # training the network + with nlp.select_pipes(enable="textcat"): + optimizer = nlp.initialize() + for i in range(3): + losses = {} + batches = util.minibatch(train_data, size=compounding(4.0, 32.0, 1.001)) + + for batch in batches: + nlp.update(examples=batch, sgd=optimizer, drop=0.1, losses=losses) + # processing of an empty doc should result in 0.0 for all categories + doc = nlp("") + assert doc.cats["offensive"] == 0.0 + assert doc.cats["inoffensive"] == 0.0 + + +@pytest.mark.parametrize( + "textcat_config", + [ + single_label_default_config, + single_label_bow_config, + single_label_cnn_config, + multi_label_default_config, + multi_label_bow_config, + multi_label_cnn_config, + ], +) +@pytest.mark.issue(5551) +def test_issue5551(textcat_config): + """Test that after fixing the random seed, the results of the pipeline are truly identical""" + component = "textcat" + + pipe_cfg = Config().from_str(textcat_config) + results = [] + for i in range(3): + fix_random_seed(0) + nlp = English() + text = "Once hot, form ping-pong-ball-sized balls of the mixture, each weighing roughly 25 g." 
+        annots = {"cats": {"Labe1": 1.0, "Label2": 0.0, "Label3": 0.0}}
+        pipe = nlp.add_pipe(component, config=pipe_cfg, last=True)
+        for label in set(annots["cats"]):
+            pipe.add_label(label)
+        # Train
+        nlp.initialize()
+        doc = nlp.make_doc(text)
+        nlp.update([Example.from_dict(doc, annots)])
+        # Store the result of each iteration
+        result = pipe.model.predict([doc])
+        results.append(result[0])
+    # All results should be the same because of the fixed seed
+    assert len(results) == 3
+    ops = get_current_ops()
+    assert_almost_equal(ops.to_numpy(results[0]), ops.to_numpy(results[1]), decimal=5)
+    assert_almost_equal(ops.to_numpy(results[0]), ops.to_numpy(results[2]), decimal=5)
+
+
+CONFIG_ISSUE_6908 = """
+[paths]
+train = "TRAIN_PLACEHOLDER"
+raw = null
+init_tok2vec = null
+vectors = null
+
+[system]
+seed = 0
+gpu_allocator = null
+
+[nlp]
+lang = "en"
+pipeline = ["textcat"]
+tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
+disabled = []
+before_creation = null
+after_creation = null
+after_pipeline_creation = null
+batch_size = 1000
+
+[components]
+
+[components.textcat]
+factory = "TEXTCAT_PLACEHOLDER"
+
+[corpora]
+
+[corpora.train]
+@readers = "spacy.Corpus.v1"
+path = ${paths:train}
+
+[corpora.dev]
+@readers = "spacy.Corpus.v1"
+path = ${paths:train}
+
+
+[training]
+train_corpus = "corpora.train"
+dev_corpus = "corpora.dev"
+seed = ${system.seed}
+gpu_allocator = ${system.gpu_allocator}
+frozen_components = []
+before_to_disk = null
+
+[pretraining]
+
+[initialize]
+vectors = ${paths.vectors}
+init_tok2vec = ${paths.init_tok2vec}
+vocab_data = null
+lookups = null
+before_init = null
+after_init = null
+
+[initialize.components]
+
+[initialize.components.textcat]
+labels = ['label1', 'label2']
+
+[initialize.tokenizer]
+"""
+
+
+@pytest.mark.parametrize(
+    "component_name",
+    ["textcat", "textcat_multilabel"],
+)
+@pytest.mark.issue(6908)
+def test_issue6908(component_name):
+    """Test initializing textcat with labels in a list"""
+
+    def create_data(out_file):
+        nlp = spacy.blank("en")
+        doc = nlp.make_doc("Some text")
+        doc.cats = {"label1": 0, "label2": 1}
+        out_data = DocBin(docs=[doc]).to_bytes()
+        with out_file.open("wb") as file_:
+            file_.write(out_data)
+
+    with make_tempdir() as tmp_path:
+        train_path = tmp_path / "train.spacy"
+        create_data(train_path)
+        config_str = CONFIG_ISSUE_6908.replace("TEXTCAT_PLACEHOLDER", component_name)
+        config_str = config_str.replace("TRAIN_PLACEHOLDER", train_path.as_posix())
+        config = util.load_config_from_str(config_str)
+        init_nlp(config)
+
+
+@pytest.mark.issue(7019)
+def test_issue7019():
+    scores = {"LABEL_A": 0.39829102, "LABEL_B": 0.938298329382, "LABEL_C": None}
+    print_textcats_auc_per_cat(msg, scores)
+    scores = {
+        "LABEL_A": {"p": 0.3420302, "r": 0.3929020, "f": 0.49823928932},
+        "LABEL_B": {"p": None, "r": None, "f": None},
+    }
+    print_prf_per_type(msg, scores, name="foo", type="bar")
+
+
 @pytest.mark.skip(reason="Test is flakey when run with others")
 def test_simple_train():
     nlp = Language()
diff --git a/spacy/tests/regression/__init__.py b/spacy/tests/regression/__init__.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/spacy/tests/regression/test_issue1-1000.py b/spacy/tests/regression/test_issue1-1000.py
deleted file mode 100644
index 4846d2075..000000000
--- a/spacy/tests/regression/test_issue1-1000.py
+++ /dev/null
@@ -1,486 +0,0 @@
-import pytest
-import random
-from spacy import util
-from spacy.training import Example
-from spacy.matcher import Matcher
-from spacy.attrs import IS_PUNCT, ORTH, LOWER
-from 
spacy.vocab import Vocab -from spacy.lang.en import English -from spacy.lookups import Lookups -from spacy.tokens import Doc, Span - -from ..util import make_tempdir - - -@pytest.mark.issue(118) -@pytest.mark.parametrize( - "patterns", - [ - [[{"LOWER": "celtics"}], [{"LOWER": "boston"}, {"LOWER": "celtics"}]], - [[{"LOWER": "boston"}, {"LOWER": "celtics"}], [{"LOWER": "celtics"}]], - ], -) -def test_issue118(en_tokenizer, patterns): - """Test a bug that arose from having overlapping matches""" - text = ( - "how many points did lebron james score against the boston celtics last night" - ) - doc = en_tokenizer(text) - ORG = doc.vocab.strings["ORG"] - matcher = Matcher(doc.vocab) - matcher.add("BostonCeltics", patterns) - assert len(list(doc.ents)) == 0 - matches = [(ORG, start, end) for _, start, end in matcher(doc)] - assert matches == [(ORG, 9, 11), (ORG, 10, 11)] - doc.ents = matches[:1] - ents = list(doc.ents) - assert len(ents) == 1 - assert ents[0].label == ORG - assert ents[0].start == 9 - assert ents[0].end == 11 - - -@pytest.mark.issue(118) -@pytest.mark.parametrize( - "patterns", - [ - [[{"LOWER": "boston"}], [{"LOWER": "boston"}, {"LOWER": "celtics"}]], - [[{"LOWER": "boston"}, {"LOWER": "celtics"}], [{"LOWER": "boston"}]], - ], -) -def test_issue118_prefix_reorder(en_tokenizer, patterns): - """Test a bug that arose from having overlapping matches""" - text = ( - "how many points did lebron james score against the boston celtics last night" - ) - doc = en_tokenizer(text) - ORG = doc.vocab.strings["ORG"] - matcher = Matcher(doc.vocab) - matcher.add("BostonCeltics", patterns) - assert len(list(doc.ents)) == 0 - matches = [(ORG, start, end) for _, start, end in matcher(doc)] - doc.ents += tuple(matches)[1:] - assert matches == [(ORG, 9, 10), (ORG, 9, 11)] - ents = doc.ents - assert len(ents) == 1 - assert ents[0].label == ORG - assert ents[0].start == 9 - assert ents[0].end == 11 - - -@pytest.mark.issue(242) -def test_issue242(en_tokenizer): - """Test overlapping multi-word phrases.""" - text = "There are different food safety standards in different countries." 
- patterns = [ - [{"LOWER": "food"}, {"LOWER": "safety"}], - [{"LOWER": "safety"}, {"LOWER": "standards"}], - ] - doc = en_tokenizer(text) - matcher = Matcher(doc.vocab) - matcher.add("FOOD", patterns) - matches = [(ent_type, start, end) for ent_type, start, end in matcher(doc)] - match1, match2 = matches - assert match1[1] == 3 - assert match1[2] == 5 - assert match2[1] == 4 - assert match2[2] == 6 - with pytest.raises(ValueError): - # One token can only be part of one entity, so test that the matches - # can't be added as entities - doc.ents += tuple(matches) - - -@pytest.mark.issue(309) -def test_issue309(en_vocab): - """Test Issue #309: SBD fails on empty string""" - doc = Doc(en_vocab, words=[" "], heads=[0], deps=["ROOT"]) - assert len(doc) == 1 - sents = list(doc.sents) - assert len(sents) == 1 - - -@pytest.mark.issue(351) -def test_issue351(en_tokenizer): - doc = en_tokenizer(" This is a cat.") - assert doc[0].idx == 0 - assert len(doc[0]) == 3 - assert doc[1].idx == 3 - - -@pytest.mark.issue(360) -def test_issue360(en_tokenizer): - """Test tokenization of big ellipsis""" - tokens = en_tokenizer("$45...............Asking") - assert len(tokens) > 2 - - -@pytest.mark.issue(361) -@pytest.mark.parametrize("text1,text2", [("cat", "dog")]) -def test_issue361(en_vocab, text1, text2): - """Test Issue #361: Equality of lexemes""" - assert en_vocab[text1] == en_vocab[text1] - assert en_vocab[text1] != en_vocab[text2] - - -@pytest.mark.issue(587) -def test_issue587(en_tokenizer): - """Test that Matcher doesn't segfault on particular input""" - doc = en_tokenizer("a b; c") - matcher = Matcher(doc.vocab) - matcher.add("TEST1", [[{ORTH: "a"}, {ORTH: "b"}]]) - matches = matcher(doc) - assert len(matches) == 1 - matcher.add("TEST2", [[{ORTH: "a"}, {ORTH: "b"}, {IS_PUNCT: True}, {ORTH: "c"}]]) - matches = matcher(doc) - assert len(matches) == 2 - matcher.add("TEST3", [[{ORTH: "a"}, {ORTH: "b"}, {IS_PUNCT: True}, {ORTH: "d"}]]) - matches = matcher(doc) - assert len(matches) == 2 - - -@pytest.mark.issue(588) -def test_issue588(en_vocab): - matcher = Matcher(en_vocab) - with pytest.raises(ValueError): - matcher.add("TEST", [[]]) - - -@pytest.mark.issue(590) -def test_issue590(en_vocab): - """Test overlapping matches""" - doc = Doc(en_vocab, words=["n", "=", "1", ";", "a", ":", "5", "%"]) - matcher = Matcher(en_vocab) - matcher.add( - "ab", [[{"IS_ALPHA": True}, {"ORTH": ":"}, {"LIKE_NUM": True}, {"ORTH": "%"}]] - ) - matcher.add("ab", [[{"IS_ALPHA": True}, {"ORTH": "="}, {"LIKE_NUM": True}]]) - matches = matcher(doc) - assert len(matches) == 2 - - -@pytest.mark.issue(595) -@pytest.mark.skip(reason="Old vocab-based lemmatization") -def test_issue595(): - """Test lemmatization of base forms""" - words = ["Do", "n't", "feed", "the", "dog"] - lookups = Lookups() - lookups.add_table("lemma_rules", {"verb": [["ed", "e"]]}) - lookups.add_table("lemma_index", {"verb": {}}) - lookups.add_table("lemma_exc", {"verb": {}}) - vocab = Vocab() - doc = Doc(vocab, words=words) - doc[2].tag_ = "VB" - assert doc[2].text == "feed" - assert doc[2].lemma_ == "feed" - - -@pytest.mark.issue(599) -def test_issue599(en_vocab): - doc = Doc(en_vocab) - doc2 = Doc(doc.vocab) - doc2.from_bytes(doc.to_bytes()) - assert doc2.has_annotation("DEP") - - -@pytest.mark.issue(600) -def test_issue600(): - vocab = Vocab(tag_map={"NN": {"pos": "NOUN"}}) - doc = Doc(vocab, words=["hello"]) - doc[0].tag_ = "NN" - - -@pytest.mark.issue(615) -def test_issue615(en_tokenizer): - def merge_phrases(matcher, doc, i, matches): - """Merge a phrase. 
We have to be careful here because we'll change the - token indices. To avoid problems, merge all the phrases once we're called - on the last match.""" - if i != len(matches) - 1: - return None - spans = [Span(doc, start, end, label=label) for label, start, end in matches] - with doc.retokenize() as retokenizer: - for span in spans: - tag = "NNP" if span.label_ else span.root.tag_ - attrs = {"tag": tag, "lemma": span.text} - retokenizer.merge(span, attrs=attrs) - doc.ents = doc.ents + (span,) - - text = "The golf club is broken" - pattern = [{"ORTH": "golf"}, {"ORTH": "club"}] - label = "Sport_Equipment" - doc = en_tokenizer(text) - matcher = Matcher(doc.vocab) - matcher.add(label, [pattern], on_match=merge_phrases) - matcher(doc) - entities = list(doc.ents) - assert entities != [] - assert entities[0].label != 0 - - -@pytest.mark.issue(736) -@pytest.mark.parametrize("text,number", [("7am", "7"), ("11p.m.", "11")]) -def test_issue736(en_tokenizer, text, number): - """Test that times like "7am" are tokenized correctly and that numbers are - converted to string.""" - tokens = en_tokenizer(text) - assert len(tokens) == 2 - assert tokens[0].text == number - - -@pytest.mark.issue(740) -@pytest.mark.parametrize("text", ["3/4/2012", "01/12/1900"]) -def test_issue740(en_tokenizer, text): - """Test that dates are not split and kept as one token. This behaviour is - currently inconsistent, since dates separated by hyphens are still split. - This will be hard to prevent without causing clashes with numeric ranges.""" - tokens = en_tokenizer(text) - assert len(tokens) == 1 - - -@pytest.mark.issue(743) -def test_issue743(): - doc = Doc(Vocab(), ["hello", "world"]) - token = doc[0] - s = set([token]) - items = list(s) - assert items[0] is token - - -@pytest.mark.issue(744) -@pytest.mark.parametrize("text", ["We were scared", "We Were Scared"]) -def test_issue744(en_tokenizer, text): - """Test that 'were' and 'Were' are excluded from the contractions - generated by the English tokenizer exceptions.""" - tokens = en_tokenizer(text) - assert len(tokens) == 3 - assert tokens[1].text.lower() == "were" - - -@pytest.mark.issue(759) -@pytest.mark.parametrize( - "text,is_num", [("one", True), ("ten", True), ("teneleven", False)] -) -def test_issue759(en_tokenizer, text, is_num): - tokens = en_tokenizer(text) - assert tokens[0].like_num == is_num - - -@pytest.mark.issue(775) -@pytest.mark.parametrize("text", ["Shell", "shell", "Shed", "shed"]) -def test_issue775(en_tokenizer, text): - """Test that 'Shell' and 'shell' are excluded from the contractions - generated by the English tokenizer exceptions.""" - tokens = en_tokenizer(text) - assert len(tokens) == 1 - assert tokens[0].text == text - - -@pytest.mark.issue(792) -@pytest.mark.parametrize("text", ["This is a string ", "This is a string\u0020"]) -def test_issue792(en_tokenizer, text): - """Test for Issue #792: Trailing whitespace is removed after tokenization.""" - doc = en_tokenizer(text) - assert "".join([token.text_with_ws for token in doc]) == text - - -@pytest.mark.issue(792) -@pytest.mark.parametrize("text", ["This is a string", "This is a string\n"]) -def test_control_issue792(en_tokenizer, text): - """Test base case for Issue #792: Non-trailing whitespace""" - doc = en_tokenizer(text) - assert "".join([token.text_with_ws for token in doc]) == text - - -@pytest.mark.issue(801) -@pytest.mark.skip( - reason="Can not be fixed unless with variable-width lookbehinds, cf. 
PR #3218" -) -@pytest.mark.parametrize( - "text,tokens", - [ - ('"deserve,"--and', ['"', "deserve", ',"--', "and"]), - ("exception;--exclusive", ["exception", ";--", "exclusive"]), - ("day.--Is", ["day", ".--", "Is"]), - ("refinement:--just", ["refinement", ":--", "just"]), - ("memories?--To", ["memories", "?--", "To"]), - ("Useful.=--Therefore", ["Useful", ".=--", "Therefore"]), - ("=Hope.=--Pandora", ["=", "Hope", ".=--", "Pandora"]), - ], -) -def test_issue801(en_tokenizer, text, tokens): - """Test that special characters + hyphens are split correctly.""" - doc = en_tokenizer(text) - assert len(doc) == len(tokens) - assert [t.text for t in doc] == tokens - - -@pytest.mark.issue(805) -@pytest.mark.parametrize( - "text,expected_tokens", - [ - ( - "Smörsåsen används bl.a. till fisk", - ["Smörsåsen", "används", "bl.a.", "till", "fisk"], - ), - ( - "Jag kommer först kl. 13 p.g.a. diverse förseningar", - ["Jag", "kommer", "först", "kl.", "13", "p.g.a.", "diverse", "förseningar"], - ), - ], -) -def test_issue805(sv_tokenizer, text, expected_tokens): - tokens = sv_tokenizer(text) - token_list = [token.text for token in tokens if not token.is_space] - assert expected_tokens == token_list - - -@pytest.mark.issue(850) -def test_issue850(): - """The variable-length pattern matches the succeeding token. Check we - handle the ambiguity correctly.""" - vocab = Vocab(lex_attr_getters={LOWER: lambda string: string.lower()}) - matcher = Matcher(vocab) - pattern = [{"LOWER": "bob"}, {"OP": "*"}, {"LOWER": "frank"}] - matcher.add("FarAway", [pattern]) - doc = Doc(matcher.vocab, words=["bob", "and", "and", "frank"]) - match = matcher(doc) - assert len(match) == 1 - ent_id, start, end = match[0] - assert start == 0 - assert end == 4 - - -@pytest.mark.issue(850) -def test_issue850_basic(): - """Test Matcher matches with '*' operator and Boolean flag""" - vocab = Vocab(lex_attr_getters={LOWER: lambda string: string.lower()}) - matcher = Matcher(vocab) - pattern = [{"LOWER": "bob"}, {"OP": "*", "LOWER": "and"}, {"LOWER": "frank"}] - matcher.add("FarAway", [pattern]) - doc = Doc(matcher.vocab, words=["bob", "and", "and", "frank"]) - match = matcher(doc) - assert len(match) == 1 - ent_id, start, end = match[0] - assert start == 0 - assert end == 4 - - -@pytest.mark.issue(852) -@pytest.mark.skip( - reason="French exception list is not enabled in the default tokenizer anymore" -) -@pytest.mark.parametrize( - "text", ["au-delàs", "pair-programmâmes", "terra-formées", "σ-compacts"] -) -def test_issue852(fr_tokenizer, text): - """Test that French tokenizer exceptions are imported correctly.""" - tokens = fr_tokenizer(text) - assert len(tokens) == 1 - - -@pytest.mark.issue(859) -@pytest.mark.parametrize( - "text", ["aaabbb@ccc.com\nThank you!", "aaabbb@ccc.com \nThank you!"] -) -def test_issue859(en_tokenizer, text): - """Test that no extra space is added in doc.text method.""" - doc = en_tokenizer(text) - assert doc.text == text - - -@pytest.mark.issue(886) -@pytest.mark.parametrize("text", ["Datum:2014-06-02\nDokument:76467"]) -def test_issue886(en_tokenizer, text): - """Test that token.idx matches the original text index for texts with newlines.""" - doc = en_tokenizer(text) - for token in doc: - assert len(token.text) == len(token.text_with_ws) - assert text[token.idx] == token.text[0] - - -@pytest.mark.issue(891) -@pytest.mark.parametrize("text", ["want/need"]) -def test_issue891(en_tokenizer, text): - """Test that / infixes are split correctly.""" - tokens = en_tokenizer(text) - assert len(tokens) == 3 - assert 
tokens[1].text == "/" - - -@pytest.mark.issue(912) -@pytest.mark.skip(reason="Old vocab-based lemmatization") -@pytest.mark.parametrize( - "text,tag,lemma", - [("anus", "NN", "anus"), ("princess", "NN", "princess"), ("inner", "JJ", "inner")], -) -def test_issue912(en_vocab, text, tag, lemma): - """Test base-forms are preserved.""" - doc = Doc(en_vocab, words=[text]) - doc[0].tag_ = tag - assert doc[0].lemma_ == lemma - - -@pytest.mark.issue(957) -@pytest.mark.slow -def test_issue957(en_tokenizer): - """Test that spaCy doesn't hang on many punctuation characters. - If this test hangs, check (new) regular expressions for conflicting greedy operators - """ - # Skip test if pytest-timeout is not installed - pytest.importorskip("pytest_timeout") - for punct in [".", ",", "'", '"', ":", "?", "!", ";", "-"]: - string = "0" - for i in range(1, 100): - string += punct + str(i) - doc = en_tokenizer(string) - assert doc - - -@pytest.mark.issue(999) -def test_issue999(): - """Test that adding entities and resuming training works passably OK. - There are two issues here: - 1) We have to re-add labels. This isn't very nice. - 2) There's no way to set the learning rate for the weight update, so we - end up out-of-scale, causing it to learn too fast. - """ - TRAIN_DATA = [ - ["hey", []], - ["howdy", []], - ["hey there", []], - ["hello", []], - ["hi", []], - ["i'm looking for a place to eat", []], - ["i'm looking for a place in the north of town", [(31, 36, "LOCATION")]], - ["show me chinese restaurants", [(8, 15, "CUISINE")]], - ["show me chines restaurants", [(8, 14, "CUISINE")]], - ] - nlp = English() - ner = nlp.add_pipe("ner") - for _, offsets in TRAIN_DATA: - for start, end, label in offsets: - ner.add_label(label) - nlp.initialize() - for itn in range(20): - random.shuffle(TRAIN_DATA) - for raw_text, entity_offsets in TRAIN_DATA: - example = Example.from_dict( - nlp.make_doc(raw_text), {"entities": entity_offsets} - ) - nlp.update([example]) - - with make_tempdir() as model_dir: - nlp.to_disk(model_dir) - nlp2 = util.load_model_from_path(model_dir) - - for raw_text, entity_offsets in TRAIN_DATA: - doc = nlp2(raw_text) - ents = {(ent.start_char, ent.end_char): ent.label_ for ent in doc.ents} - for start, end, label in entity_offsets: - if (start, end) in ents: - assert ents[(start, end)] == label - break - else: - if entity_offsets: - raise Exception(ents) diff --git a/spacy/tests/regression/test_issue1001-1500.py b/spacy/tests/regression/test_issue1001-1500.py deleted file mode 100644 index 0a60e4477..000000000 --- a/spacy/tests/regression/test_issue1001-1500.py +++ /dev/null @@ -1,174 +0,0 @@ -import pytest -import re -from spacy.tokens import Doc -from spacy.vocab import Vocab -from spacy.lang.en import English -from spacy.lang.lex_attrs import LEX_ATTRS -from spacy.matcher import Matcher -from spacy.tokenizer import Tokenizer -from spacy.symbols import ORTH, LEMMA, POS - - -@pytest.mark.issue(1061) -def test_issue1061(): - """Test special-case works after tokenizing. Was caching problem.""" - text = "I like _MATH_ even _MATH_ when _MATH_, except when _MATH_ is _MATH_! but not _MATH_." - tokenizer = English().tokenizer - doc = tokenizer(text) - assert "MATH" in [w.text for w in doc] - assert "_MATH_" not in [w.text for w in doc] - - tokenizer.add_special_case("_MATH_", [{ORTH: "_MATH_"}]) - doc = tokenizer(text) - assert "_MATH_" in [w.text for w in doc] - assert "MATH" not in [w.text for w in doc] - - # For sanity, check it works when pipeline is clean. 
- tokenizer = English().tokenizer - tokenizer.add_special_case("_MATH_", [{ORTH: "_MATH_"}]) - doc = tokenizer(text) - assert "_MATH_" in [w.text for w in doc] - assert "MATH" not in [w.text for w in doc] - - -@pytest.mark.skip( - reason="Can not be fixed without variable-width look-behind (which we don't want)" -) -@pytest.mark.issue(1235) -def test_issue1235(): - """Test that g is not split of if preceded by a number and a letter""" - nlp = English() - testwords = "e2g 2g 52g" - doc = nlp(testwords) - assert len(doc) == 5 - assert doc[0].text == "e2g" - assert doc[1].text == "2" - assert doc[2].text == "g" - assert doc[3].text == "52" - assert doc[4].text == "g" - - -@pytest.mark.issue(1242) -def test_issue1242(): - nlp = English() - doc = nlp("") - assert len(doc) == 0 - docs = list(nlp.pipe(["", "hello"])) - assert len(docs[0]) == 0 - assert len(docs[1]) == 1 - - -@pytest.mark.skip(reason="v3 no longer supports LEMMA/POS in tokenizer special cases") -@pytest.mark.issue(1250) -def test_issue1250(): - """Test cached special cases.""" - special_case = [{ORTH: "reimbur", LEMMA: "reimburse", POS: "VERB"}] - nlp = English() - nlp.tokenizer.add_special_case("reimbur", special_case) - lemmas = [w.lemma_ for w in nlp("reimbur, reimbur...")] - assert lemmas == ["reimburse", ",", "reimburse", "..."] - lemmas = [w.lemma_ for w in nlp("reimbur, reimbur...")] - assert lemmas == ["reimburse", ",", "reimburse", "..."] - - -@pytest.mark.issue(1257) -def test_issue1257(): - """Test that tokens compare correctly.""" - doc1 = Doc(Vocab(), words=["a", "b", "c"]) - doc2 = Doc(Vocab(), words=["a", "c", "e"]) - assert doc1[0] != doc2[0] - assert not doc1[0] == doc2[0] - - -@pytest.mark.issue(1375) -def test_issue1375(): - """Test that token.nbor() raises IndexError for out-of-bounds access.""" - doc = Doc(Vocab(), words=["0", "1", "2"]) - with pytest.raises(IndexError): - assert doc[0].nbor(-1) - assert doc[1].nbor(-1).text == "0" - with pytest.raises(IndexError): - assert doc[2].nbor(1) - assert doc[1].nbor(1).text == "2" - - -@pytest.mark.issue(1434) -def test_issue1434(): - """Test matches occur when optional element at end of short doc.""" - pattern = [{"ORTH": "Hello"}, {"IS_ALPHA": True, "OP": "?"}] - vocab = Vocab(lex_attr_getters=LEX_ATTRS) - hello_world = Doc(vocab, words=["Hello", "World"]) - hello = Doc(vocab, words=["Hello"]) - matcher = Matcher(vocab) - matcher.add("MyMatcher", [pattern]) - matches = matcher(hello_world) - assert matches - matches = matcher(hello) - assert matches - - -@pytest.mark.parametrize( - "string,start,end", - [ - ("a", 0, 1), - ("a b", 0, 2), - ("a c", 0, 1), - ("a b c", 0, 2), - ("a b b c", 0, 3), - ("a b b", 0, 3), - ], -) -@pytest.mark.issue(1450) -def test_issue1450(string, start, end): - """Test matcher works when patterns end with * operator.""" - pattern = [{"ORTH": "a"}, {"ORTH": "b", "OP": "*"}] - matcher = Matcher(Vocab()) - matcher.add("TSTEND", [pattern]) - doc = Doc(Vocab(), words=string.split()) - matches = matcher(doc) - if start is None or end is None: - assert matches == [] - assert matches[-1][1] == start - assert matches[-1][2] == end - - -@pytest.mark.issue(1488) -def test_issue1488(): - prefix_re = re.compile(r"""[\[\("']""") - suffix_re = re.compile(r"""[\]\)"']""") - infix_re = re.compile(r"""[-~\.]""") - simple_url_re = re.compile(r"""^https?://""") - - def my_tokenizer(nlp): - return Tokenizer( - nlp.vocab, - {}, - prefix_search=prefix_re.search, - suffix_search=suffix_re.search, - infix_finditer=infix_re.finditer, - 
token_match=simple_url_re.match, - ) - - nlp = English() - nlp.tokenizer = my_tokenizer(nlp) - doc = nlp("This is a test.") - for token in doc: - assert token.text - - -@pytest.mark.issue(1494) -def test_issue1494(): - infix_re = re.compile(r"""[^a-z]""") - test_cases = [ - ("token 123test", ["token", "1", "2", "3", "test"]), - ("token 1test", ["token", "1test"]), - ("hello...test", ["hello", ".", ".", ".", "test"]), - ] - - def new_tokenizer(nlp): - return Tokenizer(nlp.vocab, {}, infix_finditer=infix_re.finditer) - - nlp = English() - nlp.tokenizer = new_tokenizer(nlp) - for text, expected in test_cases: - assert [token.text for token in nlp(text)] == expected diff --git a/spacy/tests/regression/test_issue1501-2000.py b/spacy/tests/regression/test_issue1501-2000.py deleted file mode 100644 index 07f173843..000000000 --- a/spacy/tests/regression/test_issue1501-2000.py +++ /dev/null @@ -1,375 +0,0 @@ -import pytest -import gc -import numpy -import copy - -from spacy.training import Example -from spacy.lang.en import English -from spacy.lang.en.stop_words import STOP_WORDS -from spacy.lang.lex_attrs import is_stop -from spacy.vectors import Vectors -from spacy.vocab import Vocab -from spacy.language import Language -from spacy.tokens import Doc, Span, Token -from spacy.attrs import HEAD, DEP -from spacy.matcher import Matcher - -from ..util import make_tempdir - - -@pytest.mark.issue(1506) -def test_issue1506(): - def string_generator(): - for _ in range(10001): - yield "It's sentence produced by that bug." - for _ in range(10001): - yield "I erase some hbdsaj lemmas." - for _ in range(10001): - yield "I erase lemmas." - for _ in range(10001): - yield "It's sentence produced by that bug." - for _ in range(10001): - yield "It's sentence produced by that bug." - - nlp = English() - for i, d in enumerate(nlp.pipe(string_generator())): - # We should run cleanup more than one time to actually cleanup data. - # In first run — clean up only mark strings as «not hitted». - if i == 10000 or i == 20000 or i == 30000: - gc.collect() - for t in d: - str(t.lemma_) - - -@pytest.mark.issue(1518) -def test_issue1518(): - """Test vectors.resize() works.""" - vectors = Vectors(shape=(10, 10)) - vectors.add("hello", row=2) - vectors.resize((5, 9)) - - -@pytest.mark.issue(1537) -def test_issue1537(): - """Test that Span.as_doc() doesn't segfault.""" - string = "The sky is blue . The man is pink . The dog is purple ." - doc = Doc(Vocab(), words=string.split()) - doc[0].sent_start = True - for word in doc[1:]: - if word.nbor(-1).text == ".": - word.sent_start = True - else: - word.sent_start = False - sents = list(doc.sents) - sent0 = sents[0].as_doc() - sent1 = sents[1].as_doc() - assert isinstance(sent0, Doc) - assert isinstance(sent1, Doc) - - -# TODO: Currently segfaulting, due to l_edge and r_edge misalignment -@pytest.mark.issue(1537) -# def test_issue1537_model(): -# nlp = load_spacy('en') -# doc = nlp('The sky is blue. The man is pink. 
The dog is purple.') -# sents = [s.as_doc() for s in doc.sents] -# print(list(sents[0].noun_chunks)) -# print(list(sents[1].noun_chunks)) - - -@pytest.mark.issue(1539) -def test_issue1539(): - """Ensure vectors.resize() doesn't try to modify dictionary during iteration.""" - v = Vectors(shape=(10, 10), keys=[5, 3, 98, 100]) - v.resize((100, 100)) - - -@pytest.mark.issue(1547) -def test_issue1547(): - """Test that entity labels still match after merging tokens.""" - words = ["\n", "worda", ".", "\n", "wordb", "-", "Biosphere", "2", "-", " \n"] - doc = Doc(Vocab(), words=words) - doc.ents = [Span(doc, 6, 8, label=doc.vocab.strings["PRODUCT"])] - with doc.retokenize() as retokenizer: - retokenizer.merge(doc[5:7]) - assert [ent.text for ent in doc.ents] - - -@pytest.mark.issue(1612) -def test_issue1612(en_tokenizer): - doc = en_tokenizer("The black cat purrs.") - span = doc[1:3] - assert span.orth_ == span.text - - -@pytest.mark.issue(1654) -def test_issue1654(): - nlp = Language(Vocab()) - assert not nlp.pipeline - - @Language.component("component") - def component(doc): - return doc - - nlp.add_pipe("component", name="1") - nlp.add_pipe("component", name="2", after="1") - nlp.add_pipe("component", name="3", after="2") - assert nlp.pipe_names == ["1", "2", "3"] - nlp2 = Language(Vocab()) - assert not nlp2.pipeline - nlp2.add_pipe("component", name="3") - nlp2.add_pipe("component", name="2", before="3") - nlp2.add_pipe("component", name="1", before="2") - assert nlp2.pipe_names == ["1", "2", "3"] - - -@pytest.mark.parametrize("text", ["test@example.com", "john.doe@example.co.uk"]) -@pytest.mark.issue(1698) -def test_issue1698(en_tokenizer, text): - doc = en_tokenizer(text) - assert len(doc) == 1 - assert not doc[0].like_url - - -@pytest.mark.issue(1727) -def test_issue1727(): - """Test that models with no pretrained vectors can be deserialized - correctly after vectors are added.""" - nlp = Language(Vocab()) - data = numpy.ones((3, 300), dtype="f") - vectors = Vectors(data=data, keys=["I", "am", "Matt"]) - tagger = nlp.create_pipe("tagger") - tagger.add_label("PRP") - assert tagger.cfg.get("pretrained_dims", 0) == 0 - tagger.vocab.vectors = vectors - with make_tempdir() as path: - tagger.to_disk(path) - tagger = nlp.create_pipe("tagger").from_disk(path) - assert tagger.cfg.get("pretrained_dims", 0) == 0 - - -@pytest.mark.issue(1757) -def test_issue1757(): - """Test comparison against None doesn't cause segfault.""" - doc = Doc(Vocab(), words=["a", "b", "c"]) - assert not doc[0] < None - assert not doc[0] is None - assert doc[0] >= None - assert not doc[:2] < None - assert not doc[:2] is None - assert doc[:2] >= None - assert not doc.vocab["a"] is None - assert not doc.vocab["a"] < None - - -@pytest.mark.issue(1758) -def test_issue1758(en_tokenizer): - """Test that "would've" is handled by the English tokenizer exceptions.""" - tokens = en_tokenizer("would've") - assert len(tokens) == 2 - - -@pytest.mark.issue(1773) -def test_issue1773(en_tokenizer): - """Test that spaces don't receive a POS but no TAG. 
This is the root cause - of the serialization issue reported in #1773.""" - doc = en_tokenizer("\n") - if doc[0].pos_ == "SPACE": - assert doc[0].tag_ != "" - - -@pytest.mark.issue(1799) -def test_issue1799(): - """Test sentence boundaries are deserialized correctly, even for - non-projective sentences.""" - heads_deps = numpy.asarray( - [ - [1, 397], - [4, 436], - [2, 426], - [1, 402], - [0, 8206900633647566924], - [18446744073709551615, 440], - [18446744073709551614, 442], - ], - dtype="uint64", - ) - doc = Doc(Vocab(), words="Just what I was looking for .".split()) - doc.vocab.strings.add("ROOT") - doc = doc.from_array([HEAD, DEP], heads_deps) - assert len(list(doc.sents)) == 1 - - -@pytest.mark.issue(1807) -def test_issue1807(): - """Test vocab.set_vector also adds the word to the vocab.""" - vocab = Vocab(vectors_name="test_issue1807") - assert "hello" not in vocab - vocab.set_vector("hello", numpy.ones((50,), dtype="f")) - assert "hello" in vocab - - -@pytest.mark.issue(1834) -def test_issue1834(): - """Test that sentence boundaries & parse/tag flags are not lost - during serialization.""" - words = ["This", "is", "a", "first", "sentence", ".", "And", "another", "one"] - doc = Doc(Vocab(), words=words) - doc[6].is_sent_start = True - new_doc = Doc(doc.vocab).from_bytes(doc.to_bytes()) - assert new_doc[6].sent_start - assert not new_doc.has_annotation("DEP") - assert not new_doc.has_annotation("TAG") - doc = Doc( - Vocab(), - words=words, - tags=["TAG"] * len(words), - heads=[0, 0, 0, 0, 0, 0, 6, 6, 6], - deps=["dep"] * len(words), - ) - new_doc = Doc(doc.vocab).from_bytes(doc.to_bytes()) - assert new_doc[6].sent_start - assert new_doc.has_annotation("DEP") - assert new_doc.has_annotation("TAG") - - -@pytest.mark.issue(1868) -def test_issue1868(): - """Test Vocab.__contains__ works with int keys.""" - vocab = Vocab() - lex = vocab["hello"] - assert lex.orth in vocab - assert lex.orth_ in vocab - assert "some string" not in vocab - int_id = vocab.strings.add("some string") - assert int_id not in vocab - - -@pytest.mark.issue(1883) -def test_issue1883(): - matcher = Matcher(Vocab()) - matcher.add("pat1", [[{"orth": "hello"}]]) - doc = Doc(matcher.vocab, words=["hello"]) - assert len(matcher(doc)) == 1 - new_matcher = copy.deepcopy(matcher) - new_doc = Doc(new_matcher.vocab, words=["hello"]) - assert len(new_matcher(new_doc)) == 1 - - -@pytest.mark.parametrize("word", ["the"]) -@pytest.mark.issue(1889) -def test_issue1889(word): - assert is_stop(word, STOP_WORDS) == is_stop(word.upper(), STOP_WORDS) - - -@pytest.mark.skip(reason="obsolete with the config refactor of v.3") -@pytest.mark.issue(1915) -def test_issue1915(): - cfg = {"hidden_depth": 2} # should error out - nlp = Language() - ner = nlp.add_pipe("ner") - ner.add_label("answer") - with pytest.raises(ValueError): - nlp.initialize(**cfg) - - -@pytest.mark.issue(1945) -def test_issue1945(): - """Test regression in Matcher introduced in v2.0.6.""" - matcher = Matcher(Vocab()) - matcher.add("MWE", [[{"orth": "a"}, {"orth": "a"}]]) - doc = Doc(matcher.vocab, words=["a", "a", "a"]) - matches = matcher(doc) # we should see two overlapping matches here - assert len(matches) == 2 - assert matches[0][1:] == (0, 2) - assert matches[1][1:] == (1, 3) - - -@pytest.mark.issue(1963) -def test_issue1963(en_tokenizer): - """Test that doc.merge() resizes doc.tensor""" - doc = en_tokenizer("a b c d") - doc.tensor = numpy.ones((len(doc), 128), dtype="f") - with doc.retokenize() as retokenizer: - retokenizer.merge(doc[0:2]) - assert len(doc) == 3 - 
assert doc.tensor.shape == (3, 128) - - -@pytest.mark.parametrize("label", ["U-JOB-NAME"]) -@pytest.mark.issue(1967) -def test_issue1967(label): - nlp = Language() - config = {} - ner = nlp.create_pipe("ner", config=config) - example = Example.from_dict( - Doc(ner.vocab, words=["word"]), - { - "ids": [0], - "words": ["word"], - "tags": ["tag"], - "heads": [0], - "deps": ["dep"], - "entities": [label], - }, - ) - assert "JOB-NAME" in ner.moves.get_actions(examples=[example])[1] - - -@pytest.mark.issue(1971) -def test_issue1971(en_vocab): - # Possibly related to #2675 and #2671? - matcher = Matcher(en_vocab) - pattern = [ - {"ORTH": "Doe"}, - {"ORTH": "!", "OP": "?"}, - {"_": {"optional": True}, "OP": "?"}, - {"ORTH": "!", "OP": "?"}, - ] - Token.set_extension("optional", default=False) - matcher.add("TEST", [pattern]) - doc = Doc(en_vocab, words=["Hello", "John", "Doe", "!"]) - # We could also assert length 1 here, but this is more conclusive, because - # the real problem here is that it returns a duplicate match for a match_id - # that's not actually in the vocab! - matches = matcher(doc) - assert all([match_id in en_vocab.strings for match_id, start, end in matches]) - - -def test_issue_1971_2(en_vocab): - matcher = Matcher(en_vocab) - pattern1 = [{"ORTH": "EUR", "LOWER": {"IN": ["eur"]}}, {"LIKE_NUM": True}] - pattern2 = [{"LIKE_NUM": True}, {"ORTH": "EUR"}] # {"IN": ["EUR"]}}] - doc = Doc(en_vocab, words=["EUR", "10", "is", "10", "EUR"]) - matcher.add("TEST1", [pattern1, pattern2]) - matches = matcher(doc) - assert len(matches) == 2 - - -def test_issue_1971_3(en_vocab): - """Test that pattern matches correctly for multiple extension attributes.""" - Token.set_extension("a", default=1, force=True) - Token.set_extension("b", default=2, force=True) - doc = Doc(en_vocab, words=["hello", "world"]) - matcher = Matcher(en_vocab) - matcher.add("A", [[{"_": {"a": 1}}]]) - matcher.add("B", [[{"_": {"b": 2}}]]) - matches = sorted((en_vocab.strings[m_id], s, e) for m_id, s, e in matcher(doc)) - assert len(matches) == 4 - assert matches == sorted([("A", 0, 1), ("A", 1, 2), ("B", 0, 1), ("B", 1, 2)]) - - -def test_issue_1971_4(en_vocab): - """Test that pattern matches correctly with multiple extension attribute - values on a single token. - """ - Token.set_extension("ext_a", default="str_a", force=True) - Token.set_extension("ext_b", default="str_b", force=True) - matcher = Matcher(en_vocab) - doc = Doc(en_vocab, words=["this", "is", "text"]) - pattern = [{"_": {"ext_a": "str_a", "ext_b": "str_b"}}] * 3 - matcher.add("TEST", [pattern]) - matches = matcher(doc) - # Uncommenting this caused a segmentation fault - assert len(matches) == 1 - assert matches[0] == (en_vocab.strings["TEST"], 0, 3) diff --git a/spacy/tests/regression/test_issue2001-2500.py b/spacy/tests/regression/test_issue2001-2500.py deleted file mode 100644 index a07360c2c..000000000 --- a/spacy/tests/regression/test_issue2001-2500.py +++ /dev/null @@ -1,152 +0,0 @@ -import pytest -import numpy -from spacy.tokens import Doc -from spacy.matcher import Matcher -from spacy.displacy import render -from spacy.training import iob_to_biluo -from spacy.lang.it import Italian -from spacy.lang.en import English - -from ..util import add_vecs_to_vocab - - -@pytest.mark.skip( - reason="Can not be fixed without iterative looping between prefix/suffix and infix" -) -@pytest.mark.issue(2070) -def test_issue2070(): - """Test that checks that a dot followed by a quote is handled - appropriately. 
- """ - # Problem: The dot is now properly split off, but the prefix/suffix rules - # are not applied again afterwards. This means that the quote will still be - # attached to the remaining token. - nlp = English() - doc = nlp('First sentence."A quoted sentence" he said ...') - assert len(doc) == 11 - - -@pytest.mark.issue(2179) -def test_issue2179(): - """Test that spurious 'extra_labels' aren't created when initializing NER.""" - nlp = Italian() - ner = nlp.add_pipe("ner") - ner.add_label("CITIZENSHIP") - nlp.initialize() - nlp2 = Italian() - nlp2.add_pipe("ner") - assert len(nlp2.get_pipe("ner").labels) == 0 - model = nlp2.get_pipe("ner").model - model.attrs["resize_output"](model, nlp.get_pipe("ner").moves.n_moves) - nlp2.from_bytes(nlp.to_bytes()) - assert "extra_labels" not in nlp2.get_pipe("ner").cfg - assert nlp2.get_pipe("ner").labels == ("CITIZENSHIP",) - - -@pytest.mark.issue(2203) -def test_issue2203(en_vocab): - """Test that lemmas are set correctly in doc.from_array.""" - words = ["I", "'ll", "survive"] - tags = ["PRP", "MD", "VB"] - lemmas = ["-PRON-", "will", "survive"] - tag_ids = [en_vocab.strings.add(tag) for tag in tags] - lemma_ids = [en_vocab.strings.add(lemma) for lemma in lemmas] - doc = Doc(en_vocab, words=words) - # Work around lemma corruption problem and set lemmas after tags - doc.from_array("TAG", numpy.array(tag_ids, dtype="uint64")) - doc.from_array("LEMMA", numpy.array(lemma_ids, dtype="uint64")) - assert [t.tag_ for t in doc] == tags - assert [t.lemma_ for t in doc] == lemmas - # We need to serialize both tag and lemma, since this is what causes the bug - doc_array = doc.to_array(["TAG", "LEMMA"]) - new_doc = Doc(doc.vocab, words=words).from_array(["TAG", "LEMMA"], doc_array) - assert [t.tag_ for t in new_doc] == tags - assert [t.lemma_ for t in new_doc] == lemmas - - -@pytest.mark.issue(2219) -def test_issue2219(en_vocab): - vectors = [("a", [1, 2, 3]), ("letter", [4, 5, 6])] - add_vecs_to_vocab(en_vocab, vectors) - [(word1, vec1), (word2, vec2)] = vectors - doc = Doc(en_vocab, words=[word1, word2]) - assert doc[0].similarity(doc[1]) == doc[1].similarity(doc[0]) - - -@pytest.mark.issue(2361) -def test_issue2361(de_vocab): - chars = ("<", ">", "&", """) - words = ["<", ">", "&", '"'] - doc = Doc(de_vocab, words=words, deps=["dep"] * len(words)) - html = render(doc) - for char in chars: - assert char in html - - -@pytest.mark.issue(2385) -def test_issue2385(): - """Test that IOB tags are correctly converted to BILUO tags.""" - # fix bug in labels with a 'b' character - tags1 = ("B-BRAWLER", "I-BRAWLER", "I-BRAWLER") - assert iob_to_biluo(tags1) == ["B-BRAWLER", "I-BRAWLER", "L-BRAWLER"] - # maintain support for iob1 format - tags2 = ("I-ORG", "I-ORG", "B-ORG") - assert iob_to_biluo(tags2) == ["B-ORG", "L-ORG", "U-ORG"] - # maintain support for iob2 format - tags3 = ("B-PERSON", "I-PERSON", "B-PERSON") - assert iob_to_biluo(tags3) == ["B-PERSON", "L-PERSON", "U-PERSON"] - - -@pytest.mark.parametrize( - "tags", - [ - ("B-ORG", "L-ORG"), - ("B-PERSON", "I-PERSON", "L-PERSON"), - ("U-BRAWLER", "U-BRAWLER"), - ], -) -@pytest.mark.issue(2385) -def test_issue2385_biluo(tags): - """Test that BILUO-compatible tags aren't modified.""" - assert iob_to_biluo(tags) == list(tags) - - -@pytest.mark.issue(2396) -def test_issue2396(en_vocab): - words = ["She", "created", "a", "test", "for", "spacy"] - heads = [1, 1, 3, 1, 3, 4] - deps = ["dep"] * len(heads) - matrix = numpy.array( - [ - [0, 1, 1, 1, 1, 1], - [1, 1, 1, 1, 1, 1], - [1, 1, 2, 3, 3, 3], - [1, 1, 3, 3, 3, 3], - 
[1, 1, 3, 3, 4, 4], - [1, 1, 3, 3, 4, 5], - ], - dtype=numpy.int32, - ) - doc = Doc(en_vocab, words=words, heads=heads, deps=deps) - span = doc[:] - assert (doc.get_lca_matrix() == matrix).all() - assert (span.get_lca_matrix() == matrix).all() - - -@pytest.mark.issue(2464) -def test_issue2464(en_vocab): - """Test problem with successive ?. This is the same bug, so putting it here.""" - matcher = Matcher(en_vocab) - doc = Doc(en_vocab, words=["a", "b"]) - matcher.add("4", [[{"OP": "?"}, {"OP": "?"}]]) - matches = matcher(doc) - assert len(matches) == 3 - - -@pytest.mark.issue(2482) -def test_issue2482(): - """Test we can serialize and deserialize a blank NER or parser model.""" - nlp = Italian() - nlp.add_pipe("ner") - b = nlp.to_bytes() - Italian().from_bytes(b) diff --git a/spacy/tests/regression/test_issue2501-3000.py b/spacy/tests/regression/test_issue2501-3000.py deleted file mode 100644 index cbb7f0621..000000000 --- a/spacy/tests/regression/test_issue2501-3000.py +++ /dev/null @@ -1,238 +0,0 @@ -import pytest -from spacy import displacy -from spacy.training import Example -from spacy.lang.en import English -from spacy.lang.ja import Japanese -from spacy.lang.xx import MultiLanguage -from spacy.language import Language -from spacy.matcher import Matcher -from spacy.tokens import Doc, Span -from spacy.vocab import Vocab -from spacy.compat import pickle -import numpy -import random - - -@pytest.mark.issue(2564) -def test_issue2564(): - """Test the tagger sets has_annotation("TAG") correctly when used via Language.pipe.""" - nlp = Language() - tagger = nlp.add_pipe("tagger") - tagger.add_label("A") - nlp.initialize() - doc = nlp("hello world") - assert doc.has_annotation("TAG") - docs = nlp.pipe(["hello", "world"]) - piped_doc = next(docs) - assert piped_doc.has_annotation("TAG") - - -@pytest.mark.issue(2569) -def test_issue2569(en_tokenizer): - """Test that operator + is greedy.""" - doc = en_tokenizer("It is May 15, 1993.") - doc.ents = [Span(doc, 2, 6, label=doc.vocab.strings["DATE"])] - matcher = Matcher(doc.vocab) - matcher.add("RULE", [[{"ENT_TYPE": "DATE", "OP": "+"}]]) - matched = [doc[start:end] for _, start, end in matcher(doc)] - matched = sorted(matched, key=len, reverse=True) - assert len(matched) == 10 - assert len(matched[0]) == 4 - assert matched[0].text == "May 15, 1993" - - -@pytest.mark.parametrize( - "text", - [ - "ABLEItemColumn IAcceptance Limits of ErrorIn-Service Limits of ErrorColumn IIColumn IIIColumn IVColumn VComputed VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeCubic FeetCubic FeetCubic FeetCubic FeetCubic Feet1Up to 10.0100.0050.0100.005220.0200.0100.0200.010350.0360.0180.0360.0184100.0500.0250.0500.0255Over 100.5% of computed volume0.25% of computed volume0.5% of computed volume0.25% of computed volume TABLE ItemColumn IAcceptance Limits of ErrorIn-Service Limits of ErrorColumn IIColumn IIIColumn IVColumn VComputed VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeCubic FeetCubic FeetCubic FeetCubic FeetCubic Feet1Up to 10.0100.0050.0100.005220.0200.0100.0200.010350.0360.0180.0360.0184100.0500.0250.0500.0255Over 100.5% of computed volume0.25% of computed volume0.5% of computed volume0.25% of computed volume ItemColumn IAcceptance Limits of ErrorIn-Service Limits of ErrorColumn IIColumn IIIColumn IVColumn VComputed VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeUnder Registration 
of\xa0VolumeOver Registration of\xa0VolumeCubic FeetCubic FeetCubic FeetCubic FeetCubic Feet1Up to 10.0100.0050.0100.005220.0200.0100.0200.010350.0360.0180.0360.0184100.0500.0250.0500.0255Over 100.5% of computed volume0.25% of computed volume0.5% of computed volume0.25% of computed volume", - "oow.jspsearch.eventoracleopenworldsearch.technologyoraclesolarissearch.technologystoragesearch.technologylinuxsearch.technologyserverssearch.technologyvirtualizationsearch.technologyengineeredsystemspcodewwmkmppscem:", - ], -) -@pytest.mark.issue(2626) -def test_issue2626_2835(en_tokenizer, text): - """Check that sentence doesn't cause an infinite loop in the tokenizer.""" - doc = en_tokenizer(text) - assert doc - - -@pytest.mark.issue(2656) -def test_issue2656(en_tokenizer): - """Test that tokenizer correctly splits off punctuation after numbers with - decimal points. - """ - doc = en_tokenizer("I went for 40.3, and got home by 10.0.") - assert len(doc) == 11 - assert doc[0].text == "I" - assert doc[1].text == "went" - assert doc[2].text == "for" - assert doc[3].text == "40.3" - assert doc[4].text == "," - assert doc[5].text == "and" - assert doc[6].text == "got" - assert doc[7].text == "home" - assert doc[8].text == "by" - assert doc[9].text == "10.0" - assert doc[10].text == "." - - -@pytest.mark.issue(2671) -def test_issue2671(): - """Ensure the correct entity ID is returned for matches with quantifiers. - See also #2675 - """ - nlp = English() - matcher = Matcher(nlp.vocab) - pattern_id = "test_pattern" - pattern = [ - {"LOWER": "high"}, - {"IS_PUNCT": True, "OP": "?"}, - {"LOWER": "adrenaline"}, - ] - matcher.add(pattern_id, [pattern]) - doc1 = nlp("This is a high-adrenaline situation.") - doc2 = nlp("This is a high adrenaline situation.") - matches1 = matcher(doc1) - for match_id, start, end in matches1: - assert nlp.vocab.strings[match_id] == pattern_id - matches2 = matcher(doc2) - for match_id, start, end in matches2: - assert nlp.vocab.strings[match_id] == pattern_id - - -@pytest.mark.issue(2728) -def test_issue2728(en_vocab): - """Test that displaCy ENT visualizer escapes HTML correctly.""" - doc = Doc(en_vocab, words=["test", "<RELEASE>", "test"]) - doc.ents = [Span(doc, 0, 1, label="TEST")] - html = displacy.render(doc, style="ent") - assert "&lt;RELEASE&gt;" in html - doc.ents = [Span(doc, 1, 2, label="TEST")] - html = displacy.render(doc, style="ent") - assert "&lt;RELEASE&gt;" in html - - -@pytest.mark.issue(2754) -def test_issue2754(en_tokenizer): - """Test that words like 'a' and 'a.m.' don't get exceptional norm values.""" - a = en_tokenizer("a") - assert a[0].norm_ == "a" - am = en_tokenizer("am") - assert am[0].norm_ == "am" - - -@pytest.mark.issue(2772) -def test_issue2772(en_vocab): - """Test that deprojectivization doesn't mess up sentence boundaries.""" - # fmt: off - words = ["When", "we", "write", "or", "communicate", "virtually", ",", "we", "can", "hide", "our", "true", "feelings", "."] - # fmt: on - # A tree with a non-projective (i.e. crossing) arc - # The arcs (0, 4) and (2, 9) cross.
- heads = [4, 2, 9, 2, 2, 4, 9, 9, 9, 9, 12, 12, 9, 9] - deps = ["dep"] * len(heads) - doc = Doc(en_vocab, words=words, heads=heads, deps=deps) - assert doc[1].is_sent_start is False - - -@pytest.mark.parametrize("text", ["-0.23", "+123,456", "±1"]) -@pytest.mark.parametrize("lang_cls", [English, MultiLanguage]) -@pytest.mark.issue(2782) -def test_issue2782(text, lang_cls): - """Check that like_num handles + and - before number.""" - nlp = lang_cls() - doc = nlp(text) - assert len(doc) == 1 - assert doc[0].like_num - - -@pytest.mark.issue(2800) -def test_issue2800(): - """Test issue that arises when too many labels are added to NER model. - Used to cause segfault. - """ - nlp = English() - train_data = [] - train_data.extend( - [Example.from_dict(nlp.make_doc("One sentence"), {"entities": []})] - ) - entity_types = [str(i) for i in range(1000)] - ner = nlp.add_pipe("ner") - for entity_type in list(entity_types): - ner.add_label(entity_type) - optimizer = nlp.initialize() - for i in range(20): - losses = {} - random.shuffle(train_data) - for example in train_data: - nlp.update([example], sgd=optimizer, losses=losses, drop=0.5) - - -@pytest.mark.issue(2822) -def test_issue2822(it_tokenizer): - """Test that the abbreviation of poco is kept as one word.""" - doc = it_tokenizer("Vuoi un po' di zucchero?") - assert len(doc) == 6 - assert doc[0].text == "Vuoi" - assert doc[1].text == "un" - assert doc[2].text == "po'" - assert doc[3].text == "di" - assert doc[4].text == "zucchero" - assert doc[5].text == "?" - - -@pytest.mark.issue(2833) -def test_issue2833(en_vocab): - """Test that a custom error is raised if a token or span is pickled.""" - doc = Doc(en_vocab, words=["Hello", "world"]) - with pytest.raises(NotImplementedError): - pickle.dumps(doc[0]) - with pytest.raises(NotImplementedError): - pickle.dumps(doc[0:2]) - - -@pytest.mark.issue(2871) -def test_issue2871(): - """Test that vectors recover the correct key for spaCy reserved words.""" - words = ["dog", "cat", "SUFFIX"] - vocab = Vocab(vectors_name="test_issue2871") - vocab.vectors.resize(shape=(3, 10)) - vector_data = numpy.zeros((3, 10), dtype="f") - for word in words: - _ = vocab[word] # noqa: F841 - vocab.set_vector(word, vector_data[0]) - vocab.vectors.name = "dummy_vectors" - assert vocab["dog"].rank == 0 - assert vocab["cat"].rank == 1 - assert vocab["SUFFIX"].rank == 2 - assert vocab.vectors.find(key="dog") == 0 - assert vocab.vectors.find(key="cat") == 1 - assert vocab.vectors.find(key="SUFFIX") == 2 - - -@pytest.mark.issue(2901) -def test_issue2901(): - """Test that `nlp` doesn't fail.""" - try: - nlp = Japanese() - except ImportError: - pytest.skip() - - doc = nlp("pythonが大好きです") - assert doc - - -@pytest.mark.issue(2926) -def test_issue2926(fr_tokenizer): - """Test that the tokenizer correctly splits tokens separated by a slash (/) - ending in a digit. 
- """ - doc = fr_tokenizer("Learn html5/css3/javascript/jquery") - assert len(doc) == 8 - assert doc[0].text == "Learn" - assert doc[1].text == "html5" - assert doc[2].text == "/" - assert doc[3].text == "css3" - assert doc[4].text == "/" - assert doc[5].text == "javascript" - assert doc[6].text == "/" - assert doc[7].text == "jquery" diff --git a/spacy/tests/regression/test_issue3001-3500.py b/spacy/tests/regression/test_issue3001-3500.py deleted file mode 100644 index 6220003dc..000000000 --- a/spacy/tests/regression/test_issue3001-3500.py +++ /dev/null @@ -1,272 +0,0 @@ -import pytest -from spacy import registry -from spacy.lang.en import English -from spacy.lang.de import German -from spacy.pipeline.ner import DEFAULT_NER_MODEL -from spacy.pipeline import EntityRuler, EntityRecognizer -from spacy.matcher import Matcher, PhraseMatcher -from spacy.tokens import Doc -from spacy.vocab import Vocab -from spacy.attrs import ENT_IOB, ENT_TYPE -from spacy.compat import pickle -from spacy import displacy -from spacy.vectors import Vectors -import numpy - - -@pytest.mark.issue(3002) -def test_issue3002(): - """Test that the tokenizer doesn't hang on a long list of dots""" - nlp = German() - doc = nlp( - "880.794.982.218.444.893.023.439.794.626.120.190.780.624.990.275.671 ist eine lange Zahl" - ) - assert len(doc) == 5 - - -@pytest.mark.issue(3009) -def test_issue3009(en_vocab): - """Test problem with matcher quantifiers""" - patterns = [ - [{"ORTH": "has"}, {"LOWER": "to"}, {"LOWER": "do"}, {"TAG": "IN"}], - [ - {"ORTH": "has"}, - {"IS_ASCII": True, "IS_PUNCT": False, "OP": "*"}, - {"LOWER": "to"}, - {"LOWER": "do"}, - {"TAG": "IN"}, - ], - [ - {"ORTH": "has"}, - {"IS_ASCII": True, "IS_PUNCT": False, "OP": "?"}, - {"LOWER": "to"}, - {"LOWER": "do"}, - {"TAG": "IN"}, - ], - ] - words = ["also", "has", "to", "do", "with"] - tags = ["RB", "VBZ", "TO", "VB", "IN"] - pos = ["ADV", "VERB", "ADP", "VERB", "ADP"] - doc = Doc(en_vocab, words=words, tags=tags, pos=pos) - matcher = Matcher(en_vocab) - for i, pattern in enumerate(patterns): - matcher.add(str(i), [pattern]) - matches = matcher(doc) - assert matches - - -@pytest.mark.issue(3012) -def test_issue3012(en_vocab): - """Test that the is_tagged attribute doesn't get overwritten when we from_array - without tag information.""" - words = ["This", "is", "10", "%", "."] - tags = ["DT", "VBZ", "CD", "NN", "."] - pos = ["DET", "VERB", "NUM", "NOUN", "PUNCT"] - ents = ["O", "O", "B-PERCENT", "I-PERCENT", "O"] - doc = Doc(en_vocab, words=words, tags=tags, pos=pos, ents=ents) - assert doc.has_annotation("TAG") - expected = ("10", "NUM", "CD", "PERCENT") - assert (doc[2].text, doc[2].pos_, doc[2].tag_, doc[2].ent_type_) == expected - header = [ENT_IOB, ENT_TYPE] - ent_array = doc.to_array(header) - doc.from_array(header, ent_array) - assert (doc[2].text, doc[2].pos_, doc[2].tag_, doc[2].ent_type_) == expected - # Serializing then deserializing - doc_bytes = doc.to_bytes() - doc2 = Doc(en_vocab).from_bytes(doc_bytes) - assert (doc2[2].text, doc2[2].pos_, doc2[2].tag_, doc2[2].ent_type_) == expected - - -@pytest.mark.issue(3199) -def test_issue3199(): - """Test that Span.noun_chunks works correctly if no noun chunks iterator - is available. To make this test future-proof, we're constructing a Doc - with a new Vocab here and a parse tree to make sure the noun chunks run. 
- """ - words = ["This", "is", "a", "sentence"] - doc = Doc(Vocab(), words=words, heads=[0] * len(words), deps=["dep"] * len(words)) - with pytest.raises(NotImplementedError): - list(doc[0:3].noun_chunks) - - -@pytest.mark.issue(3209) -def test_issue3209(): - """Test issue that occurred in spaCy nightly where NER labels were being - mapped to classes incorrectly after loading the model, when the labels - were added using ner.add_label(). - """ - nlp = English() - ner = nlp.add_pipe("ner") - ner.add_label("ANIMAL") - nlp.initialize() - move_names = ["O", "B-ANIMAL", "I-ANIMAL", "L-ANIMAL", "U-ANIMAL"] - assert ner.move_names == move_names - nlp2 = English() - ner2 = nlp2.add_pipe("ner") - model = ner2.model - model.attrs["resize_output"](model, ner.moves.n_moves) - nlp2.from_bytes(nlp.to_bytes()) - assert ner2.move_names == move_names - - -@pytest.mark.issue(3248) -def test_issue3248_1(): - """Test that the PhraseMatcher correctly reports its number of rules, not - total number of patterns.""" - nlp = English() - matcher = PhraseMatcher(nlp.vocab) - matcher.add("TEST1", [nlp("a"), nlp("b"), nlp("c")]) - matcher.add("TEST2", [nlp("d")]) - assert len(matcher) == 2 - - -@pytest.mark.issue(3248) -def test_issue3248_2(): - """Test that the PhraseMatcher can be pickled correctly.""" - nlp = English() - matcher = PhraseMatcher(nlp.vocab) - matcher.add("TEST1", [nlp("a"), nlp("b"), nlp("c")]) - matcher.add("TEST2", [nlp("d")]) - data = pickle.dumps(matcher) - new_matcher = pickle.loads(data) - assert len(new_matcher) == len(matcher) - - -@pytest.mark.issue(3277) -def test_issue3277(es_tokenizer): - """Test that hyphens are split correctly as prefixes.""" - doc = es_tokenizer("—Yo me llamo... –murmuró el niño– Emilio Sánchez Pérez.") - assert len(doc) == 14 - assert doc[0].text == "\u2014" - assert doc[5].text == "\u2013" - assert doc[9].text == "\u2013" - - -@pytest.mark.issue(3288) -def test_issue3288(en_vocab): - """Test that retokenization works correctly via displaCy when punctuation - is merged onto the preceeding token and tensor is resized.""" - words = ["Hello", "World", "!", "When", "is", "this", "breaking", "?"] - heads = [1, 1, 1, 4, 4, 6, 4, 4] - deps = ["intj", "ROOT", "punct", "advmod", "ROOT", "det", "nsubj", "punct"] - doc = Doc(en_vocab, words=words, heads=heads, deps=deps) - doc.tensor = numpy.zeros((len(words), 96), dtype="float32") - displacy.render(doc) - - -@pytest.mark.issue(3289) -def test_issue3289(): - """Test that Language.to_bytes handles serializing a pipeline component - with an uninitialized model.""" - nlp = English() - nlp.add_pipe("textcat") - bytes_data = nlp.to_bytes() - new_nlp = English() - new_nlp.add_pipe("textcat") - new_nlp.from_bytes(bytes_data) - - -@pytest.mark.issue(3328) -def test_issue3328(en_vocab): - doc = Doc(en_vocab, words=["Hello", ",", "how", "are", "you", "doing", "?"]) - matcher = Matcher(en_vocab) - patterns = [ - [{"LOWER": {"IN": ["hello", "how"]}}], - [{"LOWER": {"IN": ["you", "doing"]}}], - ] - matcher.add("TEST", patterns) - matches = matcher(doc) - assert len(matches) == 4 - matched_texts = [doc[start:end].text for _, start, end in matches] - assert matched_texts == ["Hello", "how", "you", "doing"] - - -@pytest.mark.issue(3331) -def test_issue3331(en_vocab): - """Test that duplicate patterns for different rules result in multiple - matches, one per rule. 
- """ - matcher = PhraseMatcher(en_vocab) - matcher.add("A", [Doc(en_vocab, words=["Barack", "Obama"])]) - matcher.add("B", [Doc(en_vocab, words=["Barack", "Obama"])]) - doc = Doc(en_vocab, words=["Barack", "Obama", "lifts", "America"]) - matches = matcher(doc) - assert len(matches) == 2 - match_ids = [en_vocab.strings[matches[0][0]], en_vocab.strings[matches[1][0]]] - assert sorted(match_ids) == ["A", "B"] - - -@pytest.mark.issue(3345) -def test_issue3345(): - """Test case where preset entity crosses sentence boundary.""" - nlp = English() - doc = Doc(nlp.vocab, words=["I", "live", "in", "New", "York"]) - doc[4].is_sent_start = True - ruler = EntityRuler(nlp, patterns=[{"label": "GPE", "pattern": "New York"}]) - cfg = {"model": DEFAULT_NER_MODEL} - model = registry.resolve(cfg, validate=True)["model"] - ner = EntityRecognizer(doc.vocab, model) - # Add the OUT action. I wouldn't have thought this would be necessary... - ner.moves.add_action(5, "") - ner.add_label("GPE") - doc = ruler(doc) - # Get into the state just before "New" - state = ner.moves.init_batch([doc])[0] - ner.moves.apply_transition(state, "O") - ner.moves.apply_transition(state, "O") - ner.moves.apply_transition(state, "O") - # Check that B-GPE is valid. - assert ner.moves.is_valid(state, "B-GPE") - - -@pytest.mark.issue(3412) -def test_issue3412(): - data = numpy.asarray([[0, 0, 0], [1, 2, 3], [9, 8, 7]], dtype="f") - vectors = Vectors(data=data, keys=["A", "B", "C"]) - keys, best_rows, scores = vectors.most_similar( - numpy.asarray([[9, 8, 7], [0, 0, 0]], dtype="f") - ) - assert best_rows[0] == 2 - - -@pytest.mark.skip(reason="default suffix rules avoid one upper-case letter before dot") -@pytest.mark.issue(3449) -def test_issue3449(): - nlp = English() - nlp.add_pipe("sentencizer") - text1 = "He gave the ball to I. Do you want to go to the movies with I?" - text2 = "He gave the ball to I. Do you want to go to the movies with I?" - text3 = "He gave the ball to I.\nDo you want to go to the movies with I?" 
- t1 = nlp(text1) - t2 = nlp(text2) - t3 = nlp(text3) - assert t1[5].text == "I" - assert t2[5].text == "I" - assert t3[5].text == "I" - - -@pytest.mark.issue(3456) -def test_issue3456(): - # this crashed because of a padding error in layer.ops.unflatten in thinc - nlp = English() - tagger = nlp.add_pipe("tagger") - tagger.add_label("A") - nlp.initialize() - list(nlp.pipe(["hi", ""])) - - -@pytest.mark.issue(3468) -def test_issue3468(): - """Test that sentence boundaries are set correctly so Doc.has_annotation("SENT_START") can - be restored after serialization.""" - nlp = English() - nlp.add_pipe("sentencizer") - doc = nlp("Hello world") - assert doc[0].is_sent_start - assert doc.has_annotation("SENT_START") - assert len(list(doc.sents)) == 1 - doc_bytes = doc.to_bytes() - new_doc = Doc(nlp.vocab).from_bytes(doc_bytes) - assert new_doc[0].is_sent_start - assert new_doc.has_annotation("SENT_START") - assert len(list(new_doc.sents)) == 1 diff --git a/spacy/tests/regression/test_issue3501-4000.py b/spacy/tests/regression/test_issue3501-4000.py deleted file mode 100644 index 5d9bc4e83..000000000 --- a/spacy/tests/regression/test_issue3501-4000.py +++ /dev/null @@ -1,492 +0,0 @@ -import pytest -from spacy.language import Language -from spacy.vocab import Vocab -from spacy.pipeline import EntityRuler, DependencyParser -from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL -from spacy import displacy, load -from spacy.displacy import parse_deps -from spacy.tokens import Doc, Token -from spacy.matcher import Matcher, PhraseMatcher -from spacy.errors import MatchPatternError -from spacy.util import minibatch -from spacy.training import Example -from spacy.lang.hi import Hindi -from spacy.lang.es import Spanish -from spacy.lang.en import English -from spacy.attrs import IS_ALPHA -from spacy import registry -from thinc.api import compounding -import spacy -import srsly -import numpy - -from ..util import make_tempdir - - -@pytest.mark.parametrize("word", ["don't", "don’t", "I'd", "I’d"]) -@pytest.mark.issue(3521) -def test_issue3521(en_tokenizer, word): - tok = en_tokenizer(word)[1] - # 'not' and 'would' should be stopwords, also in their abbreviated forms - assert tok.is_stop - - -def test_issue_3526_1(en_vocab): - patterns = [ - {"label": "HELLO", "pattern": "hello world"}, - {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]}, - {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]}, - {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]}, - {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"}, - ] - nlp = Language(vocab=en_vocab) - ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True) - ruler_bytes = ruler.to_bytes() - assert len(ruler) == len(patterns) - assert len(ruler.labels) == 4 - assert ruler.overwrite - new_ruler = EntityRuler(nlp) - new_ruler = new_ruler.from_bytes(ruler_bytes) - assert len(new_ruler) == len(ruler) - assert len(new_ruler.labels) == 4 - assert new_ruler.overwrite == ruler.overwrite - assert new_ruler.ent_id_sep == ruler.ent_id_sep - - -def test_issue_3526_2(en_vocab): - patterns = [ - {"label": "HELLO", "pattern": "hello world"}, - {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]}, - {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]}, - {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]}, - {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"}, - ] - nlp = Language(vocab=en_vocab) - ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True) - bytes_old_style = srsly.msgpack_dumps(ruler.patterns) - 
new_ruler = EntityRuler(nlp) - new_ruler = new_ruler.from_bytes(bytes_old_style) - assert len(new_ruler) == len(ruler) - for pattern in ruler.patterns: - assert pattern in new_ruler.patterns - assert new_ruler.overwrite is not ruler.overwrite - - -def test_issue_3526_3(en_vocab): - patterns = [ - {"label": "HELLO", "pattern": "hello world"}, - {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]}, - {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]}, - {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]}, - {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"}, - ] - nlp = Language(vocab=en_vocab) - ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True) - with make_tempdir() as tmpdir: - out_file = tmpdir / "entity_ruler" - srsly.write_jsonl(out_file.with_suffix(".jsonl"), ruler.patterns) - new_ruler = EntityRuler(nlp).from_disk(out_file) - for pattern in ruler.patterns: - assert pattern in new_ruler.patterns - assert len(new_ruler) == len(ruler) - assert new_ruler.overwrite is not ruler.overwrite - - -def test_issue_3526_4(en_vocab): - nlp = Language(vocab=en_vocab) - patterns = [{"label": "ORG", "pattern": "Apple"}] - config = {"overwrite_ents": True} - ruler = nlp.add_pipe("entity_ruler", config=config) - ruler.add_patterns(patterns) - with make_tempdir() as tmpdir: - nlp.to_disk(tmpdir) - ruler = nlp.get_pipe("entity_ruler") - assert ruler.patterns == [{"label": "ORG", "pattern": "Apple"}] - assert ruler.overwrite is True - nlp2 = load(tmpdir) - new_ruler = nlp2.get_pipe("entity_ruler") - assert new_ruler.patterns == [{"label": "ORG", "pattern": "Apple"}] - assert new_ruler.overwrite is True - - -@pytest.mark.issue(3531) -def test_issue3531(): - """Test that displaCy renderer doesn't require "settings" key.""" - example_dep = { - "words": [ - {"text": "But", "tag": "CCONJ"}, - {"text": "Google", "tag": "PROPN"}, - {"text": "is", "tag": "VERB"}, - {"text": "starting", "tag": "VERB"}, - {"text": "from", "tag": "ADP"}, - {"text": "behind.", "tag": "ADV"}, - ], - "arcs": [ - {"start": 0, "end": 3, "label": "cc", "dir": "left"}, - {"start": 1, "end": 3, "label": "nsubj", "dir": "left"}, - {"start": 2, "end": 3, "label": "aux", "dir": "left"}, - {"start": 3, "end": 4, "label": "prep", "dir": "right"}, - {"start": 4, "end": 5, "label": "pcomp", "dir": "right"}, - ], - } - example_ent = { - "text": "But Google is starting from behind.", - "ents": [{"start": 4, "end": 10, "label": "ORG"}], - } - dep_html = displacy.render(example_dep, style="dep", manual=True) - assert dep_html - ent_html = displacy.render(example_ent, style="ent", manual=True) - assert ent_html - - -@pytest.mark.issue(3540) -def test_issue3540(en_vocab): - words = ["I", "live", "in", "NewYork", "right", "now"] - tensor = numpy.asarray( - [[1.0, 1.1], [2.0, 2.1], [3.0, 3.1], [4.0, 4.1], [5.0, 5.1], [6.0, 6.1]], - dtype="f", - ) - doc = Doc(en_vocab, words=words) - doc.tensor = tensor - gold_text = ["I", "live", "in", "NewYork", "right", "now"] - assert [token.text for token in doc] == gold_text - gold_lemma = ["I", "live", "in", "NewYork", "right", "now"] - for i, lemma in enumerate(gold_lemma): - doc[i].lemma_ = lemma - assert [token.lemma_ for token in doc] == gold_lemma - vectors_1 = [token.vector for token in doc] - assert len(vectors_1) == len(doc) - - with doc.retokenize() as retokenizer: - heads = [(doc[3], 1), doc[2]] - attrs = { - "POS": ["PROPN", "PROPN"], - "LEMMA": ["New", "York"], - "DEP": ["pobj", "compound"], - } - retokenizer.split(doc[3], ["New", "York"], heads=heads, attrs=attrs) 
- - gold_text = ["I", "live", "in", "New", "York", "right", "now"] - assert [token.text for token in doc] == gold_text - gold_lemma = ["I", "live", "in", "New", "York", "right", "now"] - assert [token.lemma_ for token in doc] == gold_lemma - vectors_2 = [token.vector for token in doc] - assert len(vectors_2) == len(doc) - assert vectors_1[0].tolist() == vectors_2[0].tolist() - assert vectors_1[1].tolist() == vectors_2[1].tolist() - assert vectors_1[2].tolist() == vectors_2[2].tolist() - assert vectors_1[4].tolist() == vectors_2[5].tolist() - assert vectors_1[5].tolist() == vectors_2[6].tolist() - - -@pytest.mark.issue(3549) -def test_issue3549(en_vocab): - """Test that match pattern validation doesn't raise on empty errors.""" - matcher = Matcher(en_vocab, validate=True) - pattern = [{"LOWER": "hello"}, {"LOWER": "world"}] - matcher.add("GOOD", [pattern]) - with pytest.raises(MatchPatternError): - matcher.add("BAD", [[{"X": "Y"}]]) - - -@pytest.mark.skip("Matching currently only works on strings and integers") -@pytest.mark.issue(3555) -def test_issue3555(en_vocab): - """Test that custom extensions with default None don't break matcher.""" - Token.set_extension("issue3555", default=None) - matcher = Matcher(en_vocab) - pattern = [{"ORTH": "have"}, {"_": {"issue3555": True}}] - matcher.add("TEST", [pattern]) - doc = Doc(en_vocab, words=["have", "apple"]) - matcher(doc) - - -@pytest.mark.issue(3611) -def test_issue3611(): - """Test whether adding n-grams in the textcat works even when n > token length of some docs""" - unique_classes = ["offensive", "inoffensive"] - x_train = [ - "This is an offensive text", - "This is the second offensive text", - "inoff", - ] - y_train = ["offensive", "offensive", "inoffensive"] - nlp = spacy.blank("en") - # preparing the data - train_data = [] - for text, train_instance in zip(x_train, y_train): - cat_dict = {label: label == train_instance for label in unique_classes} - train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": cat_dict})) - # add a text categorizer component - model = { - "@architectures": "spacy.TextCatBOW.v1", - "exclusive_classes": True, - "ngram_size": 2, - "no_output_layer": False, - } - textcat = nlp.add_pipe("textcat", config={"model": model}, last=True) - for label in unique_classes: - textcat.add_label(label) - # training the network - with nlp.select_pipes(enable="textcat"): - optimizer = nlp.initialize() - for i in range(3): - losses = {} - batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001)) - - for batch in batches: - nlp.update(examples=batch, sgd=optimizer, drop=0.1, losses=losses) - - -@pytest.mark.issue(3625) -def test_issue3625(): - """Test that default punctuation rules applies to hindi unicode characters""" - nlp = Hindi() - doc = nlp("hi. how हुए. 
होटल, होटल") - expected = ["hi", ".", "how", "हुए", ".", "होटल", ",", "होटल"] - assert [token.text for token in doc] == expected - - -@pytest.mark.issue(3803) -def test_issue3803(): - """Test that spanish num-like tokens have True for like_num attribute.""" - nlp = Spanish() - text = "2 dos 1000 mil 12 doce" - doc = nlp(text) - - assert [t.like_num for t in doc] == [True, True, True, True, True, True] - - -def _parser_example(parser): - doc = Doc(parser.vocab, words=["a", "b", "c", "d"]) - gold = {"heads": [1, 1, 3, 3], "deps": ["right", "ROOT", "left", "ROOT"]} - return Example.from_dict(doc, gold) - - -@pytest.mark.issue(3830) -def test_issue3830_no_subtok(): - """Test that the parser doesn't have subtok label if not learn_tokens""" - config = { - "learn_tokens": False, - } - model = registry.resolve({"model": DEFAULT_PARSER_MODEL}, validate=True)["model"] - parser = DependencyParser(Vocab(), model, **config) - parser.add_label("nsubj") - assert "subtok" not in parser.labels - parser.initialize(lambda: [_parser_example(parser)]) - assert "subtok" not in parser.labels - - -@pytest.mark.issue(3830) -def test_issue3830_with_subtok(): - """Test that the parser does have subtok label if learn_tokens=True.""" - config = { - "learn_tokens": True, - } - model = registry.resolve({"model": DEFAULT_PARSER_MODEL}, validate=True)["model"] - parser = DependencyParser(Vocab(), model, **config) - parser.add_label("nsubj") - assert "subtok" not in parser.labels - parser.initialize(lambda: [_parser_example(parser)]) - assert "subtok" in parser.labels - - -@pytest.mark.issue(3839) -def test_issue3839(en_vocab): - """Test that match IDs returned by the matcher are correct, are in the string""" - doc = Doc(en_vocab, words=["terrific", "group", "of", "people"]) - matcher = Matcher(en_vocab) - match_id = "PATTERN" - pattern1 = [{"LOWER": "terrific"}, {"OP": "?"}, {"LOWER": "group"}] - pattern2 = [{"LOWER": "terrific"}, {"OP": "?"}, {"OP": "?"}, {"LOWER": "group"}] - matcher.add(match_id, [pattern1]) - matches = matcher(doc) - assert matches[0][0] == en_vocab.strings[match_id] - matcher = Matcher(en_vocab) - matcher.add(match_id, [pattern2]) - matches = matcher(doc) - assert matches[0][0] == en_vocab.strings[match_id] - - -@pytest.mark.parametrize( - "sentence", - [ - "The story was to the effect that a young American student recently called on Professor Christlieb with a letter of introduction.", - "The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's #1.", - "The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's number one", - "Indeed, making the one who remains do all the work has installed him into a position of such insolent tyranny, it will take a month at least to reduce him to his proper proportions.", - "It was a missed assignment, but it shouldn't have resulted in a turnover ...", - ], -) -@pytest.mark.issue(3869) -def test_issue3869(sentence): - """Test that the Doc's count_by function works consistently""" - nlp = English() - doc = nlp(sentence) - count = 0 - for token in doc: - count += token.is_alpha - assert count == doc.count_by(IS_ALPHA).get(1, 0) - - -@pytest.mark.issue(3879) -def test_issue3879(en_vocab): - doc = Doc(en_vocab, words=["This", "is", "a", "test", "."]) - assert len(doc) == 5 - pattern = [{"ORTH": "This", "OP": "?"}, {"OP": "?"}, {"ORTH": "test"}] - matcher = Matcher(en_vocab) - matcher.add("TEST", [pattern]) - assert len(matcher(doc)) == 2 
# fails because of a FP match 'is a test' - - -@pytest.mark.issue(3880) -def test_issue3880(): - """Test that `nlp.pipe()` works when an empty string ends the batch. - - Fixed in v7.0.5 of Thinc. - """ - texts = ["hello", "world", "", ""] - nlp = English() - nlp.add_pipe("parser").add_label("dep") - nlp.add_pipe("ner").add_label("PERSON") - nlp.add_pipe("tagger").add_label("NN") - nlp.initialize() - for doc in nlp.pipe(texts): - pass - - -@pytest.mark.issue(3882) -def test_issue3882(en_vocab): - """Test that displaCy doesn't serialize the doc.user_data when making a - copy of the Doc. - """ - doc = Doc(en_vocab, words=["Hello", "world"], deps=["dep", "dep"]) - doc.user_data["test"] = set() - parse_deps(doc) - - -@pytest.mark.issue(3951) -def test_issue3951(en_vocab): - """Test that combinations of optional rules are matched correctly.""" - matcher = Matcher(en_vocab) - pattern = [ - {"LOWER": "hello"}, - {"LOWER": "this", "OP": "?"}, - {"OP": "?"}, - {"LOWER": "world"}, - ] - matcher.add("TEST", [pattern]) - doc = Doc(en_vocab, words=["Hello", "my", "new", "world"]) - matches = matcher(doc) - assert len(matches) == 0 - - -@pytest.mark.issue(3959) -def test_issue3959(): - """Ensure that a modified pos attribute is serialized correctly.""" - nlp = English() - doc = nlp( - "displaCy uses JavaScript, SVG and CSS to show you how computers understand language" - ) - assert doc[0].pos_ == "" - doc[0].pos_ = "NOUN" - assert doc[0].pos_ == "NOUN" - # usually this is already True when starting from proper models instead of blank English - with make_tempdir() as tmp_dir: - file_path = tmp_dir / "my_doc" - doc.to_disk(file_path) - doc2 = nlp("") - doc2.from_disk(file_path) - assert doc2[0].pos_ == "NOUN" - - -@pytest.mark.issue(3962) -def test_issue3962(en_vocab): - """Ensure that as_doc does not result in out-of-bound access of tokens. - This is achieved by setting the head to itself if it would lie out of the span otherwise.""" - # fmt: off - words = ["He", "jests", "at", "scars", ",", "that", "never", "felt", "a", "wound", "."] - heads = [1, 7, 1, 2, 7, 7, 7, 7, 9, 7, 7] - deps = ["nsubj", "ccomp", "prep", "pobj", "punct", "nsubj", "neg", "ROOT", "det", "dobj", "punct"] - # fmt: on - doc = Doc(en_vocab, words=words, heads=heads, deps=deps) - span2 = doc[1:5] # "jests at scars ," - doc2 = span2.as_doc() - doc2_json = doc2.to_json() - assert doc2_json - # head set to itself, being the new artificial root - assert doc2[0].head.text == "jests" - assert doc2[0].dep_ == "dep" - assert doc2[1].head.text == "jests" - assert doc2[1].dep_ == "prep" - assert doc2[2].head.text == "at" - assert doc2[2].dep_ == "pobj" - assert doc2[3].head.text == "jests" # head set to the new artificial root - assert doc2[3].dep_ == "dep" - # We should still have 1 sentence - assert len(list(doc2.sents)) == 1 - span3 = doc[6:9] # "never felt a" - doc3 = span3.as_doc() - doc3_json = doc3.to_json() - assert doc3_json - assert doc3[0].head.text == "felt" - assert doc3[0].dep_ == "neg" - assert doc3[1].head.text == "felt" - assert doc3[1].dep_ == "ROOT" - assert doc3[2].head.text == "felt" # head set to ancestor - assert doc3[2].dep_ == "dep" - # We should still have 1 sentence as "a" can be attached to "felt" instead of "wound" - assert len(list(doc3.sents)) == 1 - - -@pytest.mark.issue(3962) -def test_issue3962_long(en_vocab): - """Ensure that as_doc does not result in out-of-bound access of tokens. 
- This is achieved by setting the head to itself if it would lie out of the span otherwise.""" - # fmt: off - words = ["He", "jests", "at", "scars", ".", "They", "never", "felt", "a", "wound", "."] - heads = [1, 1, 1, 2, 1, 7, 7, 7, 9, 7, 7] - deps = ["nsubj", "ROOT", "prep", "pobj", "punct", "nsubj", "neg", "ROOT", "det", "dobj", "punct"] - # fmt: on - two_sent_doc = Doc(en_vocab, words=words, heads=heads, deps=deps) - span2 = two_sent_doc[1:7] # "jests at scars. They never" - doc2 = span2.as_doc() - doc2_json = doc2.to_json() - assert doc2_json - # head set to itself, being the new artificial root (in sentence 1) - assert doc2[0].head.text == "jests" - assert doc2[0].dep_ == "ROOT" - assert doc2[1].head.text == "jests" - assert doc2[1].dep_ == "prep" - assert doc2[2].head.text == "at" - assert doc2[2].dep_ == "pobj" - assert doc2[3].head.text == "jests" - assert doc2[3].dep_ == "punct" - # head set to itself, being the new artificial root (in sentence 2) - assert doc2[4].head.text == "They" - assert doc2[4].dep_ == "dep" - # head set to the new artificial head (in sentence 2) - assert doc2[4].head.text == "They" - assert doc2[4].dep_ == "dep" - # We should still have 2 sentences - sents = list(doc2.sents) - assert len(sents) == 2 - assert sents[0].text == "jests at scars ." - assert sents[1].text == "They never" - - -@pytest.mark.issue(3972) -def test_issue3972(en_vocab): - """Test that the PhraseMatcher returns duplicates for duplicate match IDs.""" - matcher = PhraseMatcher(en_vocab) - matcher.add("A", [Doc(en_vocab, words=["New", "York"])]) - matcher.add("B", [Doc(en_vocab, words=["New", "York"])]) - doc = Doc(en_vocab, words=["I", "live", "in", "New", "York"]) - matches = matcher(doc) - - assert len(matches) == 2 - - # We should have a match for each of the two rules - found_ids = [en_vocab.strings[ent_id] for (ent_id, _, _) in matches] - assert "A" in found_ids - assert "B" in found_ids diff --git a/spacy/tests/regression/test_issue4001-4500.py b/spacy/tests/regression/test_issue4001-4500.py deleted file mode 100644 index 7b7c304a3..000000000 --- a/spacy/tests/regression/test_issue4001-4500.py +++ /dev/null @@ -1,447 +0,0 @@ -import pytest -from spacy.pipeline import TrainablePipe -from spacy.matcher import PhraseMatcher, Matcher -from spacy.tokens import Doc, Span, DocBin -from spacy.training import Example, Corpus -from spacy.training.converters import json_to_docs -from spacy.vocab import Vocab -from spacy.lang.en import English -from spacy.util import minibatch, ensure_path, load_model -from spacy.util import compile_prefix_regex, compile_suffix_regex, compile_infix_regex -from spacy.tokenizer import Tokenizer -from spacy.lang.el import Greek -from spacy.language import Language -import spacy -from thinc.api import compounding - -from ..util import make_tempdir - - -@pytest.mark.issue(4002) -def test_issue4002(en_vocab): - """Test that the PhraseMatcher can match on overwritten NORM attributes.""" - matcher = PhraseMatcher(en_vocab, attr="NORM") - pattern1 = Doc(en_vocab, words=["c", "d"]) - assert [t.norm_ for t in pattern1] == ["c", "d"] - matcher.add("TEST", [pattern1]) - doc = Doc(en_vocab, words=["a", "b", "c", "d"]) - assert [t.norm_ for t in doc] == ["a", "b", "c", "d"] - matches = matcher(doc) - assert len(matches) == 1 - matcher = PhraseMatcher(en_vocab, attr="NORM") - pattern2 = Doc(en_vocab, words=["1", "2"]) - pattern2[0].norm_ = "c" - pattern2[1].norm_ = "d" - assert [t.norm_ for t in pattern2] == ["c", "d"] - matcher.add("TEST", [pattern2]) - matches = matcher(doc) 
- assert len(matches) == 1 - - -@pytest.mark.issue(4030) -def test_issue4030(): - """Test whether textcat works fine with empty doc""" - unique_classes = ["offensive", "inoffensive"] - x_train = [ - "This is an offensive text", - "This is the second offensive text", - "inoff", - ] - y_train = ["offensive", "offensive", "inoffensive"] - nlp = spacy.blank("en") - # preparing the data - train_data = [] - for text, train_instance in zip(x_train, y_train): - cat_dict = {label: label == train_instance for label in unique_classes} - train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": cat_dict})) - # add a text categorizer component - model = { - "@architectures": "spacy.TextCatBOW.v1", - "exclusive_classes": True, - "ngram_size": 2, - "no_output_layer": False, - } - textcat = nlp.add_pipe("textcat", config={"model": model}, last=True) - for label in unique_classes: - textcat.add_label(label) - # training the network - with nlp.select_pipes(enable="textcat"): - optimizer = nlp.initialize() - for i in range(3): - losses = {} - batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001)) - - for batch in batches: - nlp.update(examples=batch, sgd=optimizer, drop=0.1, losses=losses) - # processing of an empty doc should result in 0.0 for all categories - doc = nlp("") - assert doc.cats["offensive"] == 0.0 - assert doc.cats["inoffensive"] == 0.0 - - -@pytest.mark.issue(4042) -def test_issue4042(): - """Test that serialization of an EntityRuler before NER works fine.""" - nlp = English() - # add ner pipe - ner = nlp.add_pipe("ner") - ner.add_label("SOME_LABEL") - nlp.initialize() - # Add entity ruler - patterns = [ - {"label": "MY_ORG", "pattern": "Apple"}, - {"label": "MY_GPE", "pattern": [{"lower": "san"}, {"lower": "francisco"}]}, - ] - # works fine with "after" - ruler = nlp.add_pipe("entity_ruler", before="ner") - ruler.add_patterns(patterns) - doc1 = nlp("What do you think about Apple ?") - assert doc1.ents[0].label_ == "MY_ORG" - - with make_tempdir() as d: - output_dir = ensure_path(d) - if not output_dir.exists(): - output_dir.mkdir() - nlp.to_disk(output_dir) - nlp2 = load_model(output_dir) - doc2 = nlp2("What do you think about Apple ?") - assert doc2.ents[0].label_ == "MY_ORG" - - -@pytest.mark.issue(4042) -def test_issue4042_bug2(): - """ - Test that serialization of an NER works fine when new labels were added. - This is the second bug of two bugs underlying the issue 4042. - """ - nlp1 = English() - # add ner pipe - ner1 = nlp1.add_pipe("ner") - ner1.add_label("SOME_LABEL") - nlp1.initialize() - # add a new label to the doc - doc1 = nlp1("What do you think about Apple ?") - assert len(ner1.labels) == 1 - assert "SOME_LABEL" in ner1.labels - apple_ent = Span(doc1, 5, 6, label="MY_ORG") - doc1.ents = list(doc1.ents) + [apple_ent] - # Add the label explicitly. Previously we didn't require this. 
- ner1.add_label("MY_ORG") - ner1(doc1) - assert len(ner1.labels) == 2 - assert "SOME_LABEL" in ner1.labels - assert "MY_ORG" in ner1.labels - with make_tempdir() as d: - # assert IO goes fine - output_dir = ensure_path(d) - if not output_dir.exists(): - output_dir.mkdir() - ner1.to_disk(output_dir) - config = {} - ner2 = nlp1.create_pipe("ner", config=config) - ner2.from_disk(output_dir) - assert len(ner2.labels) == 2 - - -@pytest.mark.issue(4054) -def test_issue4054(en_vocab): - """Test that a new blank model can be made with a vocab from file, - and that serialization does not drop the language at any point.""" - nlp1 = English() - vocab1 = nlp1.vocab - with make_tempdir() as d: - vocab_dir = ensure_path(d / "vocab") - if not vocab_dir.exists(): - vocab_dir.mkdir() - vocab1.to_disk(vocab_dir) - vocab2 = Vocab().from_disk(vocab_dir) - nlp2 = spacy.blank("en", vocab=vocab2) - nlp_dir = ensure_path(d / "nlp") - if not nlp_dir.exists(): - nlp_dir.mkdir() - nlp2.to_disk(nlp_dir) - nlp3 = load_model(nlp_dir) - assert nlp3.lang == "en" - - -@pytest.mark.issue(4120) -def test_issue4120(en_vocab): - """Test that matches without a final {OP: ?} token are returned.""" - matcher = Matcher(en_vocab) - matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}]]) - doc1 = Doc(en_vocab, words=["a"]) - assert len(matcher(doc1)) == 1 # works - doc2 = Doc(en_vocab, words=["a", "b", "c"]) - assert len(matcher(doc2)) == 2 # fixed - matcher = Matcher(en_vocab) - matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b"}]]) - doc3 = Doc(en_vocab, words=["a", "b", "b", "c"]) - assert len(matcher(doc3)) == 2 # works - matcher = Matcher(en_vocab) - matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b", "OP": "?"}]]) - doc4 = Doc(en_vocab, words=["a", "b", "b", "c"]) - assert len(matcher(doc4)) == 3 # fixed - - -@pytest.mark.issue(4133) -def test_issue4133(en_vocab): - nlp = English() - vocab_bytes = nlp.vocab.to_bytes() - words = ["Apple", "is", "looking", "at", "buying", "a", "startup"] - pos = ["NOUN", "VERB", "ADP", "VERB", "PROPN", "NOUN", "ADP"] - doc = Doc(en_vocab, words=words) - for i, token in enumerate(doc): - token.pos_ = pos[i] - # usually this is already True when starting from proper models instead of blank English - doc_bytes = doc.to_bytes() - vocab = Vocab() - vocab = vocab.from_bytes(vocab_bytes) - doc = Doc(vocab).from_bytes(doc_bytes) - actual = [] - for token in doc: - actual.append(token.pos_) - assert actual == pos - - -@pytest.mark.issue(4190) -def test_issue4190(): - def customize_tokenizer(nlp): - prefix_re = compile_prefix_regex(nlp.Defaults.prefixes) - suffix_re = compile_suffix_regex(nlp.Defaults.suffixes) - infix_re = compile_infix_regex(nlp.Defaults.infixes) - # Remove all exceptions where a single letter is followed by a period (e.g. 'h.') - exceptions = { - k: v - for k, v in dict(nlp.Defaults.tokenizer_exceptions).items() - if not (len(k) == 2 and k[1] == ".") - } - new_tokenizer = Tokenizer( - nlp.vocab, - exceptions, - prefix_search=prefix_re.search, - suffix_search=suffix_re.search, - infix_finditer=infix_re.finditer, - token_match=nlp.tokenizer.token_match, - ) - nlp.tokenizer = new_tokenizer - - test_string = "Test c." 
- # Load default language - nlp_1 = English() - doc_1a = nlp_1(test_string) - result_1a = [token.text for token in doc_1a] # noqa: F841 - # Modify tokenizer - customize_tokenizer(nlp_1) - doc_1b = nlp_1(test_string) - result_1b = [token.text for token in doc_1b] - # Save and Reload - with make_tempdir() as model_dir: - nlp_1.to_disk(model_dir) - nlp_2 = load_model(model_dir) - # This should be the modified tokenizer - doc_2 = nlp_2(test_string) - result_2 = [token.text for token in doc_2] - assert result_1b == result_2 - - -@pytest.mark.issue(4267) -def test_issue4267(): - """Test that running an entity_ruler after ner gives consistent results""" - nlp = English() - ner = nlp.add_pipe("ner") - ner.add_label("PEOPLE") - nlp.initialize() - assert "ner" in nlp.pipe_names - # assert that we have correct IOB annotations - doc1 = nlp("hi") - assert doc1.has_annotation("ENT_IOB") - for token in doc1: - assert token.ent_iob == 2 - # add entity ruler and run again - patterns = [{"label": "SOFTWARE", "pattern": "spacy"}] - ruler = nlp.add_pipe("entity_ruler") - ruler.add_patterns(patterns) - assert "entity_ruler" in nlp.pipe_names - assert "ner" in nlp.pipe_names - # assert that we still have correct IOB annotations - doc2 = nlp("hi") - assert doc2.has_annotation("ENT_IOB") - for token in doc2: - assert token.ent_iob == 2 - - -@pytest.mark.skip(reason="lemmatizer lookups no longer in vocab") -@pytest.mark.issue(4272) -def test_issue4272(): - """Test that lookup table can be accessed from Token.lemma if no POS tags - are available.""" - nlp = Greek() - doc = nlp("Χθες") - assert doc[0].lemma_ - - -def test_multiple_predictions(): - class DummyPipe(TrainablePipe): - def __init__(self): - self.model = "dummy_model" - - def predict(self, docs): - return ([1, 2, 3], [4, 5, 6]) - - def set_annotations(self, docs, scores): - return docs - - nlp = Language() - doc = nlp.make_doc("foo") - dummy_pipe = DummyPipe() - dummy_pipe(doc) - - -@pytest.mark.issue(4313) -def test_issue4313(): - """This should not crash or exit with some strange error code""" - beam_width = 16 - beam_density = 0.0001 - nlp = English() - config = { - "beam_width": beam_width, - "beam_density": beam_density, - } - ner = nlp.add_pipe("beam_ner", config=config) - ner.add_label("SOME_LABEL") - nlp.initialize() - # add a new label to the doc - doc = nlp("What do you think about Apple ?") - assert len(ner.labels) == 1 - assert "SOME_LABEL" in ner.labels - apple_ent = Span(doc, 5, 6, label="MY_ORG") - doc.ents = list(doc.ents) + [apple_ent] - - # ensure the beam_parse still works with the new label - docs = [doc] - ner.beam_parse(docs, drop=0.0, beam_width=beam_width, beam_density=beam_density) - assert len(ner.labels) == 2 - assert "MY_ORG" in ner.labels - - -@pytest.mark.issue(4348) -def test_issue4348(): - """Test that training the tagger with empty data, doesn't throw errors""" - nlp = English() - example = Example.from_dict(nlp.make_doc(""), {"tags": []}) - TRAIN_DATA = [example, example] - tagger = nlp.add_pipe("tagger") - tagger.add_label("A") - optimizer = nlp.initialize() - for i in range(5): - losses = {} - batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001)) - for batch in batches: - nlp.update(batch, sgd=optimizer, losses=losses) - - -@pytest.mark.issue(4367) -def test_issue4367(): - """Test that docbin init goes well""" - DocBin() - DocBin(attrs=["LEMMA"]) - DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"]) - - -@pytest.mark.issue(4373) -def test_issue4373(): - """Test that PhraseMatcher.vocab can be accessed (like 
Matcher.vocab).""" - matcher = Matcher(Vocab()) - assert isinstance(matcher.vocab, Vocab) - matcher = PhraseMatcher(Vocab()) - assert isinstance(matcher.vocab, Vocab) - - -@pytest.mark.issue(4402) -def test_issue4402(): - json_data = { - "id": 0, - "paragraphs": [ - { - "raw": "How should I cook bacon in an oven?\nI've heard of people cooking bacon in an oven.", - "sentences": [ - { - "tokens": [ - {"id": 0, "orth": "How", "ner": "O"}, - {"id": 1, "orth": "should", "ner": "O"}, - {"id": 2, "orth": "I", "ner": "O"}, - {"id": 3, "orth": "cook", "ner": "O"}, - {"id": 4, "orth": "bacon", "ner": "O"}, - {"id": 5, "orth": "in", "ner": "O"}, - {"id": 6, "orth": "an", "ner": "O"}, - {"id": 7, "orth": "oven", "ner": "O"}, - {"id": 8, "orth": "?", "ner": "O"}, - ], - "brackets": [], - }, - { - "tokens": [ - {"id": 9, "orth": "\n", "ner": "O"}, - {"id": 10, "orth": "I", "ner": "O"}, - {"id": 11, "orth": "'ve", "ner": "O"}, - {"id": 12, "orth": "heard", "ner": "O"}, - {"id": 13, "orth": "of", "ner": "O"}, - {"id": 14, "orth": "people", "ner": "O"}, - {"id": 15, "orth": "cooking", "ner": "O"}, - {"id": 16, "orth": "bacon", "ner": "O"}, - {"id": 17, "orth": "in", "ner": "O"}, - {"id": 18, "orth": "an", "ner": "O"}, - {"id": 19, "orth": "oven", "ner": "O"}, - {"id": 20, "orth": ".", "ner": "O"}, - ], - "brackets": [], - }, - ], - "cats": [ - {"label": "baking", "value": 1.0}, - {"label": "not_baking", "value": 0.0}, - ], - }, - { - "raw": "What is the difference between white and brown eggs?\n", - "sentences": [ - { - "tokens": [ - {"id": 0, "orth": "What", "ner": "O"}, - {"id": 1, "orth": "is", "ner": "O"}, - {"id": 2, "orth": "the", "ner": "O"}, - {"id": 3, "orth": "difference", "ner": "O"}, - {"id": 4, "orth": "between", "ner": "O"}, - {"id": 5, "orth": "white", "ner": "O"}, - {"id": 6, "orth": "and", "ner": "O"}, - {"id": 7, "orth": "brown", "ner": "O"}, - {"id": 8, "orth": "eggs", "ner": "O"}, - {"id": 9, "orth": "?", "ner": "O"}, - ], - "brackets": [], - }, - {"tokens": [{"id": 10, "orth": "\n", "ner": "O"}], "brackets": []}, - ], - "cats": [ - {"label": "baking", "value": 0.0}, - {"label": "not_baking", "value": 1.0}, - ], - }, - ], - } - nlp = English() - attrs = ["ORTH", "SENT_START", "ENT_IOB", "ENT_TYPE"] - with make_tempdir() as tmpdir: - output_file = tmpdir / "test4402.spacy" - docs = json_to_docs([json_data]) - data = DocBin(docs=docs, attrs=attrs).to_bytes() - with output_file.open("wb") as file_: - file_.write(data) - reader = Corpus(output_file) - train_data = list(reader(nlp)) - assert len(train_data) == 2 - - split_train_data = [] - for eg in train_data: - split_train_data.extend(eg.split_sents()) - assert len(split_train_data) == 4 diff --git a/spacy/tests/regression/test_issue4501-5000.py b/spacy/tests/regression/test_issue4501-5000.py deleted file mode 100644 index 07a00d2b7..000000000 --- a/spacy/tests/regression/test_issue4501-5000.py +++ /dev/null @@ -1,266 +0,0 @@ -import pytest -from spacy.tokens import Doc, Span, DocBin -from spacy.training import Example -from spacy.training.converters.conllu_to_docs import conllu_to_docs -from spacy.lang.en import English -from spacy.kb import KnowledgeBase -from spacy.vocab import Vocab -from spacy.language import Language -from spacy.util import ensure_path, load_model_from_path -import numpy -import pickle -from thinc.api import NumpyOps, get_current_ops - -from ..util import make_tempdir - - -@pytest.mark.issue(4528) -def test_issue4528(en_vocab): - """Test that user_data is correctly serialized in DocBin.""" - doc = Doc(en_vocab, 
words=["hello", "world"]) - doc.user_data["foo"] = "bar" - # This is how extension attribute values are stored in the user data - doc.user_data[("._.", "foo", None, None)] = "bar" - doc_bin = DocBin(store_user_data=True) - doc_bin.add(doc) - doc_bin_bytes = doc_bin.to_bytes() - new_doc_bin = DocBin(store_user_data=True).from_bytes(doc_bin_bytes) - new_doc = list(new_doc_bin.get_docs(en_vocab))[0] - assert new_doc.user_data["foo"] == "bar" - assert new_doc.user_data[("._.", "foo", None, None)] == "bar" - - -@pytest.mark.parametrize( - "text,words", [("A'B C", ["A", "'", "B", "C"]), ("A-B", ["A-B"])] -) -def test_gold_misaligned(en_tokenizer, text, words): - doc = en_tokenizer(text) - Example.from_dict(doc, {"words": words}) - - -@pytest.mark.issue(4651) -def test_issue4651_with_phrase_matcher_attr(): - """Test that the EntityRuler PhraseMatcher is deserialized correctly using - the method from_disk when the EntityRuler argument phrase_matcher_attr is - specified. - """ - text = "Spacy is a python library for nlp" - nlp = English() - patterns = [{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}] - ruler = nlp.add_pipe("entity_ruler", config={"phrase_matcher_attr": "LOWER"}) - ruler.add_patterns(patterns) - doc = nlp(text) - res = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents] - nlp_reloaded = English() - with make_tempdir() as d: - file_path = d / "entityruler" - ruler.to_disk(file_path) - nlp_reloaded.add_pipe("entity_ruler").from_disk(file_path) - doc_reloaded = nlp_reloaded(text) - res_reloaded = [(ent.text, ent.label_, ent.ent_id_) for ent in doc_reloaded.ents] - assert res == res_reloaded - - -@pytest.mark.issue(4651) -def test_issue4651_without_phrase_matcher_attr(): - """Test that the EntityRuler PhraseMatcher is deserialized correctly using - the method from_disk when the EntityRuler argument phrase_matcher_attr is - not specified. - """ - text = "Spacy is a python library for nlp" - nlp = English() - patterns = [{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}] - ruler = nlp.add_pipe("entity_ruler") - ruler.add_patterns(patterns) - doc = nlp(text) - res = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents] - nlp_reloaded = English() - with make_tempdir() as d: - file_path = d / "entityruler" - ruler.to_disk(file_path) - nlp_reloaded.add_pipe("entity_ruler").from_disk(file_path) - doc_reloaded = nlp_reloaded(text) - res_reloaded = [(ent.text, ent.label_, ent.ent_id_) for ent in doc_reloaded.ents] - assert res == res_reloaded - - -@pytest.mark.issue(4665) -def test_issue4665(): - """ - conllu_to_docs should not raise an exception if the HEAD column contains an - underscore - """ - input_data = """ -1 [ _ PUNCT -LRB- _ _ punct _ _ -2 This _ DET DT _ _ det _ _ -3 killing _ NOUN NN _ _ nsubj _ _ -4 of _ ADP IN _ _ case _ _ -5 a _ DET DT _ _ det _ _ -6 respected _ ADJ JJ _ _ amod _ _ -7 cleric _ NOUN NN _ _ nmod _ _ -8 will _ AUX MD _ _ aux _ _ -9 be _ AUX VB _ _ aux _ _ -10 causing _ VERB VBG _ _ root _ _ -11 us _ PRON PRP _ _ iobj _ _ -12 trouble _ NOUN NN _ _ dobj _ _ -13 for _ ADP IN _ _ case _ _ -14 years _ NOUN NNS _ _ nmod _ _ -15 to _ PART TO _ _ mark _ _ -16 come _ VERB VB _ _ acl _ _ -17 . _ PUNCT . 
_ _ punct _ _ -18 ] _ PUNCT -RRB- _ _ punct _ _ -""" - conllu_to_docs(input_data) - - -@pytest.mark.issue(4674) -def test_issue4674(): - """Test that setting entities with overlapping identifiers does not mess up IO""" - nlp = English() - kb = KnowledgeBase(nlp.vocab, entity_vector_length=3) - vector1 = [0.9, 1.1, 1.01] - vector2 = [1.8, 2.25, 2.01] - with pytest.warns(UserWarning): - kb.set_entities( - entity_list=["Q1", "Q1"], - freq_list=[32, 111], - vector_list=[vector1, vector2], - ) - assert kb.get_size_entities() == 1 - # dumping to file & loading back in - with make_tempdir() as d: - dir_path = ensure_path(d) - if not dir_path.exists(): - dir_path.mkdir() - file_path = dir_path / "kb" - kb.to_disk(str(file_path)) - kb2 = KnowledgeBase(nlp.vocab, entity_vector_length=3) - kb2.from_disk(str(file_path)) - assert kb2.get_size_entities() == 1 - - -@pytest.mark.skip(reason="API change: disable just disables, new exclude arg") -@pytest.mark.issue(4707) -def test_issue4707(): - """Tests that disabled component names are also excluded from nlp.from_disk - by default when loading a model. - """ - nlp = English() - nlp.add_pipe("sentencizer") - nlp.add_pipe("entity_ruler") - assert nlp.pipe_names == ["sentencizer", "entity_ruler"] - exclude = ["tokenizer", "sentencizer"] - with make_tempdir() as tmpdir: - nlp.to_disk(tmpdir, exclude=exclude) - new_nlp = load_model_from_path(tmpdir, disable=exclude) - assert "sentencizer" not in new_nlp.pipe_names - assert "entity_ruler" in new_nlp.pipe_names - - -@pytest.mark.issue(4725) -def test_issue4725_1(): - """Ensure the pickling of the NER goes well""" - vocab = Vocab(vectors_name="test_vocab_add_vector") - nlp = English(vocab=vocab) - config = { - "update_with_oracle_cut_size": 111, - } - ner = nlp.create_pipe("ner", config=config) - with make_tempdir() as tmp_path: - with (tmp_path / "ner.pkl").open("wb") as file_: - pickle.dump(ner, file_) - assert ner.cfg["update_with_oracle_cut_size"] == 111 - - with (tmp_path / "ner.pkl").open("rb") as file_: - ner2 = pickle.load(file_) - assert ner2.cfg["update_with_oracle_cut_size"] == 111 - - -@pytest.mark.issue(4725) -def test_issue4725_2(): - if isinstance(get_current_ops, NumpyOps): - # ensures that this runs correctly and doesn't hang or crash because of the global vectors - # if it does crash, it's usually because of calling 'spawn' for multiprocessing (e.g. on Windows), - # or because of issues with pickling the NER (cf test_issue4725_1) - vocab = Vocab(vectors_name="test_vocab_add_vector") - data = numpy.ndarray((5, 3), dtype="f") - data[0] = 1.0 - data[1] = 2.0 - vocab.set_vector("cat", data[0]) - vocab.set_vector("dog", data[1]) - nlp = English(vocab=vocab) - nlp.add_pipe("ner") - nlp.initialize() - docs = ["Kurt is in London."] * 10 - for _ in nlp.pipe(docs, batch_size=2, n_process=2): - pass - - -@pytest.mark.issue(4849) -def test_issue4849(): - nlp = English() - patterns = [ - {"label": "PERSON", "pattern": "joe biden", "id": "joe-biden"}, - {"label": "PERSON", "pattern": "bernie sanders", "id": "bernie-sanders"}, - ] - ruler = nlp.add_pipe("entity_ruler", config={"phrase_matcher_attr": "LOWER"}) - ruler.add_patterns(patterns) - text = """ - The left is starting to take aim at Democratic front-runner Joe Biden. - Sen. Bernie Sanders joined in her criticism: "There is no 'middle ground' when it comes to climate policy." 
- """ - # USING 1 PROCESS - count_ents = 0 - for doc in nlp.pipe([text], n_process=1): - count_ents += len([ent for ent in doc.ents if ent.ent_id > 0]) - assert count_ents == 2 - # USING 2 PROCESSES - if isinstance(get_current_ops, NumpyOps): - count_ents = 0 - for doc in nlp.pipe([text], n_process=2): - count_ents += len([ent for ent in doc.ents if ent.ent_id > 0]) - assert count_ents == 2 - - -@Language.factory("my_pipe") -class CustomPipe: - def __init__(self, nlp, name="my_pipe"): - self.name = name - Span.set_extension("my_ext", getter=self._get_my_ext) - Doc.set_extension("my_ext", default=None) - - def __call__(self, doc): - gathered_ext = [] - for sent in doc.sents: - sent_ext = self._get_my_ext(sent) - sent._.set("my_ext", sent_ext) - gathered_ext.append(sent_ext) - - doc._.set("my_ext", "\n".join(gathered_ext)) - return doc - - @staticmethod - def _get_my_ext(span): - return str(span.end) - - -@pytest.mark.issue(4903) -def test_issue4903(): - """Ensure that this runs correctly and doesn't hang or crash on Windows / - macOS.""" - nlp = English() - nlp.add_pipe("sentencizer") - nlp.add_pipe("my_pipe", after="sentencizer") - text = ["I like bananas.", "Do you like them?", "No, I prefer wasabi."] - if isinstance(get_current_ops(), NumpyOps): - docs = list(nlp.pipe(text, n_process=2)) - assert docs[0].text == "I like bananas." - assert docs[1].text == "Do you like them?" - assert docs[2].text == "No, I prefer wasabi." - - -@pytest.mark.issue(4924) -def test_issue4924(): - nlp = Language() - example = Example.from_dict(nlp.make_doc(""), {}) - nlp.evaluate([example]) diff --git a/spacy/tests/regression/test_issue5001-5500.py b/spacy/tests/regression/test_issue5001-5500.py deleted file mode 100644 index e1f5231e7..000000000 --- a/spacy/tests/regression/test_issue5001-5500.py +++ /dev/null @@ -1,149 +0,0 @@ -import numpy -from spacy.tokens import Doc, DocBin -from spacy.attrs import DEP, POS, TAG -from spacy.lang.en import English -from spacy.language import Language -from spacy.lang.en.syntax_iterators import noun_chunks -from spacy.vocab import Vocab -import spacy -from thinc.api import get_current_ops -import pytest - -from ...util import make_tempdir - - -@pytest.mark.issue(5048) -def test_issue5048(en_vocab): - words = ["This", "is", "a", "sentence"] - pos_s = ["DET", "VERB", "DET", "NOUN"] - spaces = [" ", " ", " ", ""] - deps_s = ["dep", "adj", "nn", "atm"] - tags_s = ["DT", "VBZ", "DT", "NN"] - strings = en_vocab.strings - for w in words: - strings.add(w) - deps = [strings.add(d) for d in deps_s] - pos = [strings.add(p) for p in pos_s] - tags = [strings.add(t) for t in tags_s] - attrs = [POS, DEP, TAG] - array = numpy.array(list(zip(pos, deps, tags)), dtype="uint64") - doc = Doc(en_vocab, words=words, spaces=spaces) - doc.from_array(attrs, array) - v1 = [(token.text, token.pos_, token.tag_) for token in doc] - doc2 = Doc(en_vocab, words=words, pos=pos_s, deps=deps_s, tags=tags_s) - v2 = [(token.text, token.pos_, token.tag_) for token in doc2] - assert v1 == v2 - - -@pytest.mark.issue(5082) -def test_issue5082(): - # Ensure the 'merge_entities' pipeline does something sensible for the vectors of the merged tokens - nlp = English() - vocab = nlp.vocab - array1 = numpy.asarray([0.1, 0.5, 0.8], dtype=numpy.float32) - array2 = numpy.asarray([-0.2, -0.6, -0.9], dtype=numpy.float32) - array3 = numpy.asarray([0.3, -0.1, 0.7], dtype=numpy.float32) - array4 = numpy.asarray([0.5, 0, 0.3], dtype=numpy.float32) - array34 = numpy.asarray([0.4, -0.05, 0.5], dtype=numpy.float32) - 
vocab.set_vector("I", array1) - vocab.set_vector("like", array2) - vocab.set_vector("David", array3) - vocab.set_vector("Bowie", array4) - text = "I like David Bowie" - patterns = [ - {"label": "PERSON", "pattern": [{"LOWER": "david"}, {"LOWER": "bowie"}]} - ] - ruler = nlp.add_pipe("entity_ruler") - ruler.add_patterns(patterns) - parsed_vectors_1 = [t.vector for t in nlp(text)] - assert len(parsed_vectors_1) == 4 - ops = get_current_ops() - numpy.testing.assert_array_equal(ops.to_numpy(parsed_vectors_1[0]), array1) - numpy.testing.assert_array_equal(ops.to_numpy(parsed_vectors_1[1]), array2) - numpy.testing.assert_array_equal(ops.to_numpy(parsed_vectors_1[2]), array3) - numpy.testing.assert_array_equal(ops.to_numpy(parsed_vectors_1[3]), array4) - nlp.add_pipe("merge_entities") - parsed_vectors_2 = [t.vector for t in nlp(text)] - assert len(parsed_vectors_2) == 3 - numpy.testing.assert_array_equal(ops.to_numpy(parsed_vectors_2[0]), array1) - numpy.testing.assert_array_equal(ops.to_numpy(parsed_vectors_2[1]), array2) - numpy.testing.assert_array_equal(ops.to_numpy(parsed_vectors_2[2]), array34) - - -@pytest.mark.issue(5137) -def test_issue5137(): - factory_name = "test_issue5137" - pipe_name = "my_component" - - @Language.factory(factory_name) - class MyComponent: - def __init__(self, nlp, name=pipe_name, categories="all_categories"): - self.nlp = nlp - self.categories = categories - self.name = name - - def __call__(self, doc): - pass - - def to_disk(self, path, **kwargs): - pass - - def from_disk(self, path, **cfg): - pass - - nlp = English() - my_component = nlp.add_pipe(factory_name, name=pipe_name) - assert my_component.categories == "all_categories" - with make_tempdir() as tmpdir: - nlp.to_disk(tmpdir) - overrides = {"components": {pipe_name: {"categories": "my_categories"}}} - nlp2 = spacy.load(tmpdir, config=overrides) - assert nlp2.get_pipe(pipe_name).categories == "my_categories" - - -@pytest.mark.issue(5141) -def test_issue5141(en_vocab): - """Ensure an empty DocBin does not crash on serialization""" - doc_bin = DocBin(attrs=["DEP", "HEAD"]) - assert list(doc_bin.get_docs(en_vocab)) == [] - doc_bin_bytes = doc_bin.to_bytes() - doc_bin_2 = DocBin().from_bytes(doc_bin_bytes) - assert list(doc_bin_2.get_docs(en_vocab)) == [] - - -@pytest.mark.issue(5152) -def test_issue5152(): - # Test that the comparison between a Span and a Token, goes well - # There was a bug when the number of tokens in the span equaled the number of characters in the token (!) 
- nlp = English() - text = nlp("Talk about being boring!") - text_var = nlp("Talk of being boring!") - y = nlp("Let") - span = text[0:3] # Talk about being - span_2 = text[0:3] # Talk about being - span_3 = text_var[0:3] # Talk of being - token = y[0] # Let - with pytest.warns(UserWarning): - assert span.similarity(token) == 0.0 - assert span.similarity(span_2) == 1.0 - with pytest.warns(UserWarning): - assert span_2.similarity(span_3) < 1.0 - - -@pytest.mark.issue(5458) -def test_issue5458(): - # Test that the noun chuncker does not generate overlapping spans - # fmt: off - words = ["In", "an", "era", "where", "markets", "have", "brought", "prosperity", "and", "empowerment", "."] - vocab = Vocab(strings=words) - deps = ["ROOT", "det", "pobj", "advmod", "nsubj", "aux", "relcl", "dobj", "cc", "conj", "punct"] - pos = ["ADP", "DET", "NOUN", "ADV", "NOUN", "AUX", "VERB", "NOUN", "CCONJ", "NOUN", "PUNCT"] - heads = [0, 2, 0, 9, 6, 6, 2, 6, 7, 7, 0] - # fmt: on - en_doc = Doc(vocab, words=words, pos=pos, heads=heads, deps=deps) - en_doc.noun_chunks_iterator = noun_chunks - - # if there are overlapping spans, this will fail with an E102 error "Can't merge non-disjoint spans" - nlp = English() - merge_nps = nlp.create_pipe("merge_noun_chunks") - merge_nps(en_doc) diff --git a/spacy/tests/regression/test_issue5501-6000.py b/spacy/tests/regression/test_issue5501-6000.py deleted file mode 100644 index 87c40ec2a..000000000 --- a/spacy/tests/regression/test_issue5501-6000.py +++ /dev/null @@ -1,95 +0,0 @@ -import pytest -from numpy.testing import assert_almost_equal -from thinc.api import Config, fix_random_seed, get_current_ops - -from spacy.lang.en import English -from spacy.pipeline.textcat import single_label_default_config, single_label_bow_config -from spacy.pipeline.textcat import single_label_cnn_config -from spacy.pipeline.textcat_multilabel import multi_label_default_config -from spacy.pipeline.textcat_multilabel import multi_label_bow_config -from spacy.pipeline.textcat_multilabel import multi_label_cnn_config -from spacy.tokens import Span -from spacy import displacy -from spacy.pipeline import merge_entities -from spacy.training import Example - - -@pytest.mark.parametrize( - "textcat_config", - [ - single_label_default_config, - single_label_bow_config, - single_label_cnn_config, - multi_label_default_config, - multi_label_bow_config, - multi_label_cnn_config, - ], -) -@pytest.mark.issue(5551) -def test_issue5551(textcat_config): - """Test that after fixing the random seed, the results of the pipeline are truly identical""" - component = "textcat" - - pipe_cfg = Config().from_str(textcat_config) - results = [] - for i in range(3): - fix_random_seed(0) - nlp = English() - text = "Once hot, form ping-pong-ball-sized balls of the mixture, each weighing roughly 25 g." 
- annots = {"cats": {"Labe1": 1.0, "Label2": 0.0, "Label3": 0.0}} - pipe = nlp.add_pipe(component, config=pipe_cfg, last=True) - for label in set(annots["cats"]): - pipe.add_label(label) - # Train - nlp.initialize() - doc = nlp.make_doc(text) - nlp.update([Example.from_dict(doc, annots)]) - # Store the result of each iteration - result = pipe.model.predict([doc]) - results.append(result[0]) - # All results should be the same because of the fixed seed - assert len(results) == 3 - ops = get_current_ops() - assert_almost_equal(ops.to_numpy(results[0]), ops.to_numpy(results[1]), decimal=5) - assert_almost_equal(ops.to_numpy(results[0]), ops.to_numpy(results[2]), decimal=5) - - -@pytest.mark.issue(5838) -def test_issue5838(): - # Displacy's EntityRenderer break line - # not working after last entity - sample_text = "First line\nSecond line, with ent\nThird line\nFourth line\n" - nlp = English() - doc = nlp(sample_text) - doc.ents = [Span(doc, 7, 8, label="test")] - html = displacy.render(doc, style="ent") - found = html.count("
") - assert found == 4 - - -@pytest.mark.issue(5918) -def test_issue5918(): - # Test edge case when merging entities. - nlp = English() - ruler = nlp.add_pipe("entity_ruler") - patterns = [ - {"label": "ORG", "pattern": "Digicon Inc"}, - {"label": "ORG", "pattern": "Rotan Mosle Inc's"}, - {"label": "ORG", "pattern": "Rotan Mosle Technology Partners Ltd"}, - ] - ruler.add_patterns(patterns) - - text = """ - Digicon Inc said it has completed the previously-announced disposition - of its computer systems division to an investment group led by - Rotan Mosle Inc's Rotan Mosle Technology Partners Ltd affiliate. - """ - doc = nlp(text) - assert len(doc.ents) == 3 - # make it so that the third span's head is within the entity (ent_iob=I) - # bug #5918 would wrongly transfer that I to the full entity, resulting in 2 instead of 3 final ents. - # TODO: test for logging here - # with pytest.warns(UserWarning): - # doc[29].head = doc[33] - doc = merge_entities(doc) - assert len(doc.ents) == 3 diff --git a/spacy/tests/regression/test_issue6001-6500.py b/spacy/tests/regression/test_issue6001-6500.py deleted file mode 100644 index cb27d39e4..000000000 --- a/spacy/tests/regression/test_issue6001-6500.py +++ /dev/null @@ -1,30 +0,0 @@ -from spacy.util import filter_spans -from pydantic import ValidationError -from spacy.schemas import TokenPattern, TokenPatternSchema -import pytest - - -@pytest.mark.issue(6207) -def test_issue6207(en_tokenizer): - doc = en_tokenizer("zero one two three four five six") - - # Make spans - s1 = doc[:4] - s2 = doc[3:6] # overlaps with s1 - s3 = doc[5:7] # overlaps with s2, not s1 - - result = filter_spans((s1, s2, s3)) - assert s1 in result - assert s2 not in result - assert s3 in result - - -@pytest.mark.issue(6258) -def test_issue6258(): - """Test that the non-empty constraint pattern field is respected""" - # These one is valid - TokenPatternSchema(pattern=[TokenPattern()]) - # But an empty pattern list should fail to validate - # based on the schema's constraint - with pytest.raises(ValidationError): - TokenPatternSchema(pattern=[]) diff --git a/spacy/tests/regression/test_issue6501-7000.py b/spacy/tests/regression/test_issue6501-7000.py deleted file mode 100644 index 84517d79b..000000000 --- a/spacy/tests/regression/test_issue6501-7000.py +++ /dev/null @@ -1,238 +0,0 @@ -import pytest -from spacy.lang.en import English -import numpy as np -import spacy -from spacy.tokens import Doc -from spacy.matcher import PhraseMatcher -from spacy.tokens import DocBin -from spacy.util import load_config_from_str -from spacy.training import Example -from spacy.training.initialize import init_nlp -import pickle - -from ..util import make_tempdir - - -@pytest.mark.issue(6730) -def test_issue6730(en_vocab): - """Ensure that the KB does not accept empty strings, but otherwise IO works fine.""" - from spacy.kb import KnowledgeBase - - kb = KnowledgeBase(en_vocab, entity_vector_length=3) - kb.add_entity(entity="1", freq=148, entity_vector=[1, 2, 3]) - - with pytest.raises(ValueError): - kb.add_alias(alias="", entities=["1"], probabilities=[0.4]) - assert kb.contains_alias("") is False - - kb.add_alias(alias="x", entities=["1"], probabilities=[0.2]) - kb.add_alias(alias="y", entities=["1"], probabilities=[0.1]) - - with make_tempdir() as tmp_dir: - kb.to_disk(tmp_dir) - kb.from_disk(tmp_dir) - assert kb.get_size_aliases() == 2 - assert set(kb.get_alias_strings()) == {"x", "y"} - - -@pytest.mark.issue(6755) -def test_issue6755(en_tokenizer): - doc = en_tokenizer("This is a magnificent 
sentence.") - span = doc[:0] - assert span.text_with_ws == "" - assert span.text == "" - - -@pytest.mark.parametrize( - "sentence, start_idx,end_idx,label", - [("Welcome to Mumbai, my friend", 11, 17, "GPE")], -) -@pytest.mark.issue(6815) -def test_issue6815_1(sentence, start_idx, end_idx, label): - nlp = English() - doc = nlp(sentence) - span = doc[:].char_span(start_idx, end_idx, label=label) - assert span.label_ == label - - -@pytest.mark.parametrize( - "sentence, start_idx,end_idx,kb_id", [("Welcome to Mumbai, my friend", 11, 17, 5)] -) -@pytest.mark.issue(6815) -def test_issue6815_2(sentence, start_idx, end_idx, kb_id): - nlp = English() - doc = nlp(sentence) - span = doc[:].char_span(start_idx, end_idx, kb_id=kb_id) - assert span.kb_id == kb_id - - -@pytest.mark.parametrize( - "sentence, start_idx,end_idx,vector", - [("Welcome to Mumbai, my friend", 11, 17, np.array([0.1, 0.2, 0.3]))], -) -@pytest.mark.issue(6815) -def test_issue6815_3(sentence, start_idx, end_idx, vector): - nlp = English() - doc = nlp(sentence) - span = doc[:].char_span(start_idx, end_idx, vector=vector) - assert (span.vector == vector).all() - - -@pytest.mark.issue(6839) -def test_issue6839(en_vocab): - """Ensure that PhraseMatcher accepts Span as input""" - # fmt: off - words = ["I", "like", "Spans", "and", "Docs", "in", "my", "input", ",", "and", "nothing", "else", "."] - # fmt: on - doc = Doc(en_vocab, words=words) - span = doc[:8] - pattern = Doc(en_vocab, words=["Spans", "and", "Docs"]) - matcher = PhraseMatcher(en_vocab) - matcher.add("SPACY", [pattern]) - matches = matcher(span) - assert matches - - -CONFIG_ISSUE_6908 = """ -[paths] -train = "TRAIN_PLACEHOLDER" -raw = null -init_tok2vec = null -vectors = null - -[system] -seed = 0 -gpu_allocator = null - -[nlp] -lang = "en" -pipeline = ["textcat"] -tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"} -disabled = [] -before_creation = null -after_creation = null -after_pipeline_creation = null -batch_size = 1000 - -[components] - -[components.textcat] -factory = "TEXTCAT_PLACEHOLDER" - -[corpora] - -[corpora.train] -@readers = "spacy.Corpus.v1" -path = ${paths:train} - -[corpora.dev] -@readers = "spacy.Corpus.v1" -path = ${paths:train} - - -[training] -train_corpus = "corpora.train" -dev_corpus = "corpora.dev" -seed = ${system.seed} -gpu_allocator = ${system.gpu_allocator} -frozen_components = [] -before_to_disk = null - -[pretraining] - -[initialize] -vectors = ${paths.vectors} -init_tok2vec = ${paths.init_tok2vec} -vocab_data = null -lookups = null -before_init = null -after_init = null - -[initialize.components] - -[initialize.components.textcat] -labels = ['label1', 'label2'] - -[initialize.tokenizer] -""" - - -@pytest.mark.parametrize( - "component_name", - ["textcat", "textcat_multilabel"], -) -@pytest.mark.issue(6908) -def test_issue6908(component_name): - """Test intializing textcat with labels in a list""" - - def create_data(out_file): - nlp = spacy.blank("en") - doc = nlp.make_doc("Some text") - doc.cats = {"label1": 0, "label2": 1} - out_data = DocBin(docs=[doc]).to_bytes() - with out_file.open("wb") as file_: - file_.write(out_data) - - with make_tempdir() as tmp_path: - train_path = tmp_path / "train.spacy" - create_data(train_path) - config_str = CONFIG_ISSUE_6908.replace("TEXTCAT_PLACEHOLDER", component_name) - config_str = config_str.replace("TRAIN_PLACEHOLDER", train_path.as_posix()) - config = load_config_from_str(config_str) - init_nlp(config) - - -CONFIG_ISSUE_6950 = """ -[nlp] -lang = "en" -pipeline = ["tok2vec", "tagger"] - -[components] - 
-[components.tok2vec] -factory = "tok2vec" - -[components.tok2vec.model] -@architectures = "spacy.Tok2Vec.v1" - -[components.tok2vec.model.embed] -@architectures = "spacy.MultiHashEmbed.v1" -width = ${components.tok2vec.model.encode:width} -attrs = ["NORM","PREFIX","SUFFIX","SHAPE"] -rows = [5000,2500,2500,2500] -include_static_vectors = false - -[components.tok2vec.model.encode] -@architectures = "spacy.MaxoutWindowEncoder.v1" -width = 96 -depth = 4 -window_size = 1 -maxout_pieces = 3 - -[components.ner] -factory = "ner" - -[components.tagger] -factory = "tagger" - -[components.tagger.model] -@architectures = "spacy.Tagger.v1" -nO = null - -[components.tagger.model.tok2vec] -@architectures = "spacy.Tok2VecListener.v1" -width = ${components.tok2vec.model.encode:width} -upstream = "*" -""" - - -@pytest.mark.issue(6950) -def test_issue6950(): - """Test that the nlp object with initialized tok2vec with listeners pickles - correctly (and doesn't have lambdas). - """ - nlp = English.from_config(load_config_from_str(CONFIG_ISSUE_6950)) - nlp.initialize(lambda: [Example.from_dict(nlp.make_doc("hello"), {"tags": ["V"]})]) - pickle.dumps(nlp) - nlp("hello") - pickle.dumps(nlp) diff --git a/spacy/tests/regression/test_issue7001-8000.py b/spacy/tests/regression/test_issue7001-8000.py deleted file mode 100644 index 1164e85b9..000000000 --- a/spacy/tests/regression/test_issue7001-8000.py +++ /dev/null @@ -1,288 +0,0 @@ -import pytest -from spacy.cli.evaluate import print_textcats_auc_per_cat, print_prf_per_type -from spacy.lang.en import English -from spacy.training import Example -from spacy.tokens.doc import Doc -from spacy.vocab import Vocab -from spacy.kb import KnowledgeBase -from spacy.pipeline._parser_internals.arc_eager import ArcEager -from spacy.util import load_config_from_str, load_config -from spacy.cli.init_config import fill_config -from thinc.api import Config -from wasabi import msg - -from ..util import make_tempdir - - -@pytest.mark.issue(7019) -def test_issue7019(): - scores = {"LABEL_A": 0.39829102, "LABEL_B": 0.938298329382, "LABEL_C": None} - print_textcats_auc_per_cat(msg, scores) - scores = { - "LABEL_A": {"p": 0.3420302, "r": 0.3929020, "f": 0.49823928932}, - "LABEL_B": {"p": None, "r": None, "f": None}, - } - print_prf_per_type(msg, scores, name="foo", type="bar") - - -CONFIG_7029 = """ -[nlp] -lang = "en" -pipeline = ["tok2vec", "tagger"] - -[components] - -[components.tok2vec] -factory = "tok2vec" - -[components.tok2vec.model] -@architectures = "spacy.Tok2Vec.v1" - -[components.tok2vec.model.embed] -@architectures = "spacy.MultiHashEmbed.v1" -width = ${components.tok2vec.model.encode:width} -attrs = ["NORM","PREFIX","SUFFIX","SHAPE"] -rows = [5000,2500,2500,2500] -include_static_vectors = false - -[components.tok2vec.model.encode] -@architectures = "spacy.MaxoutWindowEncoder.v1" -width = 96 -depth = 4 -window_size = 1 -maxout_pieces = 3 - -[components.tagger] -factory = "tagger" - -[components.tagger.model] -@architectures = "spacy.Tagger.v1" -nO = null - -[components.tagger.model.tok2vec] -@architectures = "spacy.Tok2VecListener.v1" -width = ${components.tok2vec.model.encode:width} -upstream = "*" -""" - - -@pytest.mark.issue(7029) -def test_issue7029(): - """Test that an empty document doesn't mess up an entire batch.""" - TRAIN_DATA = [ - ("I like green eggs", {"tags": ["N", "V", "J", "N"]}), - ("Eat blue ham", {"tags": ["V", "J", "N"]}), - ] - nlp = English.from_config(load_config_from_str(CONFIG_7029)) - train_examples = [] - for t in TRAIN_DATA: - 
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) - optimizer = nlp.initialize(get_examples=lambda: train_examples) - for i in range(50): - losses = {} - nlp.update(train_examples, sgd=optimizer, losses=losses) - texts = ["first", "second", "third", "fourth", "and", "then", "some", ""] - docs1 = list(nlp.pipe(texts, batch_size=1)) - docs2 = list(nlp.pipe(texts, batch_size=4)) - assert [doc[0].tag_ for doc in docs1[:-1]] == [doc[0].tag_ for doc in docs2[:-1]] - - -@pytest.mark.issue(7055) -def test_issue7055(): - """Test that fill-config doesn't turn sourced components into factories.""" - source_cfg = { - "nlp": {"lang": "en", "pipeline": ["tok2vec", "tagger"]}, - "components": { - "tok2vec": {"factory": "tok2vec"}, - "tagger": {"factory": "tagger"}, - }, - } - source_nlp = English.from_config(source_cfg) - with make_tempdir() as dir_path: - # We need to create a loadable source pipeline - source_path = dir_path / "test_model" - source_nlp.to_disk(source_path) - base_cfg = { - "nlp": {"lang": "en", "pipeline": ["tok2vec", "tagger", "ner"]}, - "components": { - "tok2vec": {"source": str(source_path)}, - "tagger": {"source": str(source_path)}, - "ner": {"factory": "ner"}, - }, - } - base_cfg = Config(base_cfg) - base_path = dir_path / "base.cfg" - base_cfg.to_disk(base_path) - output_path = dir_path / "config.cfg" - fill_config(output_path, base_path, silent=True) - filled_cfg = load_config(output_path) - assert filled_cfg["components"]["tok2vec"]["source"] == str(source_path) - assert filled_cfg["components"]["tagger"]["source"] == str(source_path) - assert filled_cfg["components"]["ner"]["factory"] == "ner" - assert "model" in filled_cfg["components"]["ner"] - - -@pytest.mark.issue(7056) -def test_issue7056(): - """Test that the Unshift transition works properly, and doesn't cause - sentence segmentation errors.""" - vocab = Vocab() - ae = ArcEager( - vocab.strings, ArcEager.get_actions(left_labels=["amod"], right_labels=["pobj"]) - ) - doc = Doc(vocab, words="Severe pain , after trauma".split()) - state = ae.init_batch([doc])[0] - ae.apply_transition(state, "S") - ae.apply_transition(state, "L-amod") - ae.apply_transition(state, "S") - ae.apply_transition(state, "S") - ae.apply_transition(state, "S") - ae.apply_transition(state, "R-pobj") - ae.apply_transition(state, "D") - ae.apply_transition(state, "D") - ae.apply_transition(state, "D") - assert not state.eol() - - -def test_partial_links(): - # Test that having some entities on the doc without gold links, doesn't crash - TRAIN_DATA = [ - ( - "Russ Cochran his reprints include EC Comics.", - { - "links": {(0, 12): {"Q2146908": 1.0}}, - "entities": [(0, 12, "PERSON")], - "sent_starts": [1, -1, 0, 0, 0, 0, 0, 0], - }, - ) - ] - nlp = English() - vector_length = 3 - train_examples = [] - for text, annotation in TRAIN_DATA: - doc = nlp(text) - train_examples.append(Example.from_dict(doc, annotation)) - - def create_kb(vocab): - # create artificial KB - mykb = KnowledgeBase(vocab, entity_vector_length=vector_length) - mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3]) - mykb.add_alias("Russ Cochran", ["Q2146908"], [0.9]) - return mykb - - # Create and train the Entity Linker - entity_linker = nlp.add_pipe("entity_linker", last=True) - entity_linker.set_kb(create_kb) - optimizer = nlp.initialize(get_examples=lambda: train_examples) - for i in range(2): - losses = {} - nlp.update(train_examples, sgd=optimizer, losses=losses) - - # adding additional components that are required for the entity_linker - 
nlp.add_pipe("sentencizer", first=True) - patterns = [ - {"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]}, - {"label": "ORG", "pattern": [{"LOWER": "ec"}, {"LOWER": "comics"}]}, - ] - ruler = nlp.add_pipe("entity_ruler", before="entity_linker") - ruler.add_patterns(patterns) - - # this will run the pipeline on the examples and shouldn't crash - results = nlp.evaluate(train_examples) - assert "PERSON" in results["ents_per_type"] - assert "PERSON" in results["nel_f_per_type"] - assert "ORG" in results["ents_per_type"] - assert "ORG" not in results["nel_f_per_type"] - - -@pytest.mark.issue(7065) -def test_issue7065(): - text = "Kathleen Battle sang in Mahler 's Symphony No. 8 at the Cincinnati Symphony Orchestra 's May Festival." - nlp = English() - nlp.add_pipe("sentencizer") - ruler = nlp.add_pipe("entity_ruler") - patterns = [ - { - "label": "THING", - "pattern": [ - {"LOWER": "symphony"}, - {"LOWER": "no"}, - {"LOWER": "."}, - {"LOWER": "8"}, - ], - } - ] - ruler.add_patterns(patterns) - - doc = nlp(text) - sentences = [s for s in doc.sents] - assert len(sentences) == 2 - sent0 = sentences[0] - ent = doc.ents[0] - assert ent.start < sent0.end < ent.end - assert sentences.index(ent.sent) == 0 - - -@pytest.mark.issue(7065) -def test_issue7065_b(): - # Test that the NEL doesn't crash when an entity crosses a sentence boundary - nlp = English() - vector_length = 3 - nlp.add_pipe("sentencizer") - text = "Mahler 's Symphony No. 8 was beautiful." - entities = [(0, 6, "PERSON"), (10, 24, "WORK")] - links = { - (0, 6): {"Q7304": 1.0, "Q270853": 0.0}, - (10, 24): {"Q7304": 0.0, "Q270853": 1.0}, - } - sent_starts = [1, -1, 0, 0, 0, 0, 0, 0, 0] - doc = nlp(text) - example = Example.from_dict( - doc, {"entities": entities, "links": links, "sent_starts": sent_starts} - ) - train_examples = [example] - - def create_kb(vocab): - # create artificial KB - mykb = KnowledgeBase(vocab, entity_vector_length=vector_length) - mykb.add_entity(entity="Q270853", freq=12, entity_vector=[9, 1, -7]) - mykb.add_alias( - alias="No. 
8", - entities=["Q270853"], - probabilities=[1.0], - ) - mykb.add_entity(entity="Q7304", freq=12, entity_vector=[6, -4, 3]) - mykb.add_alias( - alias="Mahler", - entities=["Q7304"], - probabilities=[1.0], - ) - return mykb - - # Create the Entity Linker component and add it to the pipeline - entity_linker = nlp.add_pipe("entity_linker", last=True) - entity_linker.set_kb(create_kb) - # train the NEL pipe - optimizer = nlp.initialize(get_examples=lambda: train_examples) - for i in range(2): - losses = {} - nlp.update(train_examples, sgd=optimizer, losses=losses) - - # Add a custom rule-based component to mimick NER - patterns = [ - {"label": "PERSON", "pattern": [{"LOWER": "mahler"}]}, - { - "label": "WORK", - "pattern": [ - {"LOWER": "symphony"}, - {"LOWER": "no"}, - {"LOWER": "."}, - {"LOWER": "8"}, - ], - }, - ] - ruler = nlp.add_pipe("entity_ruler", before="entity_linker") - ruler.add_patterns(patterns) - # test the trained model - this should not throw E148 - doc = nlp(text) - assert doc diff --git a/spacy/tests/regression/test_issue7716.py b/spacy/tests/regression/test_issue7716.py deleted file mode 100644 index d9b3967ff..000000000 --- a/spacy/tests/regression/test_issue7716.py +++ /dev/null @@ -1,55 +0,0 @@ -import pytest -from thinc.api import Adam -from spacy.attrs import NORM -from spacy.vocab import Vocab -from spacy import registry -from spacy.training import Example -from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL -from spacy.tokens import Doc -from spacy.pipeline import DependencyParser - - -@pytest.fixture -def vocab(): - return Vocab(lex_attr_getters={NORM: lambda s: s}) - - -def _parser_example(parser): - doc = Doc(parser.vocab, words=["a", "b", "c", "d"]) - gold = {"heads": [1, 1, 3, 3], "deps": ["right", "ROOT", "left", "ROOT"]} - return Example.from_dict(doc, gold) - - -@pytest.fixture -def parser(vocab): - vocab.strings.add("ROOT") - cfg = {"model": DEFAULT_PARSER_MODEL} - model = registry.resolve(cfg, validate=True)["model"] - parser = DependencyParser(vocab, model) - parser.cfg["token_vector_width"] = 4 - parser.cfg["hidden_width"] = 32 - # parser.add_label('right') - parser.add_label("left") - parser.initialize(lambda: [_parser_example(parser)]) - sgd = Adam(0.001) - - for i in range(10): - losses = {} - doc = Doc(vocab, words=["a", "b", "c", "d"]) - example = Example.from_dict( - doc, {"heads": [1, 1, 3, 3], "deps": ["left", "ROOT", "left", "ROOT"]} - ) - parser.update([example], sgd=sgd, losses=losses) - return parser - - -@pytest.mark.issue(7716) -@pytest.mark.xfail(reason="Not fixed yet") -def test_partial_annotation(parser): - doc = Doc(parser.vocab, words=["a", "b", "c", "d"]) - doc[2].is_sent_start = False - # Note that if the following line is used, then doc[2].is_sent_start == False - # doc[3].is_sent_start = False - - doc = parser(doc) - assert doc[2].is_sent_start == False diff --git a/spacy/tests/regression/test_issue8168.py b/spacy/tests/regression/test_issue8168.py deleted file mode 100644 index e3f3b5cfa..000000000 --- a/spacy/tests/regression/test_issue8168.py +++ /dev/null @@ -1,24 +0,0 @@ -import pytest -from spacy.lang.en import English - - -@pytest.mark.issue(8168) -def test_issue8168(): - nlp = English() - ruler = nlp.add_pipe("entity_ruler") - patterns = [ - {"label": "ORG", "pattern": "Apple"}, - { - "label": "GPE", - "pattern": [{"LOWER": "san"}, {"LOWER": "francisco"}], - "id": "san-francisco", - }, - { - "label": "GPE", - "pattern": [{"LOWER": "san"}, {"LOWER": "fran"}], - "id": "san-francisco", - }, - ] - 
ruler.add_patterns(patterns) - - assert ruler._ent_ids == {8043148519967183733: ("GPE", "san-francisco")} diff --git a/spacy/tests/regression/test_issue8190.py b/spacy/tests/regression/test_issue8190.py deleted file mode 100644 index 0b2f2824b..000000000 --- a/spacy/tests/regression/test_issue8190.py +++ /dev/null @@ -1,24 +0,0 @@ -import pytest - -import spacy -from spacy.lang.en import English -from ..util import make_tempdir - - -@pytest.mark.issue(8190) -def test_issue8190(): - """Test that config overrides are not lost after load is complete.""" - source_cfg = { - "nlp": { - "lang": "en", - }, - "custom": {"key": "value"}, - } - source_nlp = English.from_config(source_cfg) - with make_tempdir() as dir_path: - # We need to create a loadable source pipeline - source_path = dir_path / "test_model" - source_nlp.to_disk(source_path) - nlp = spacy.load(source_path, config={"custom": {"key": "updated_value"}}) - - assert nlp.config["custom"]["key"] == "updated_value" diff --git a/spacy/tests/regression/test_issue8216.py b/spacy/tests/regression/test_issue8216.py deleted file mode 100644 index 0370074fe..000000000 --- a/spacy/tests/regression/test_issue8216.py +++ /dev/null @@ -1,34 +0,0 @@ -import pytest - -from spacy import registry -from spacy.language import Language - - -@pytest.fixture -def nlp(): - return Language() - - -@pytest.fixture -@registry.misc("entity_ruler_patterns") -def patterns(): - return [ - {"label": "HELLO", "pattern": "hello world"}, - {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]}, - {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]}, - {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]}, - {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"}, - {"label": "TECH_ORG", "pattern": "Microsoft", "id": "a2"}, - ] - - -@pytest.mark.issue(8216) -def test_entity_ruler_fix8216(nlp, patterns): - """Test that patterns don't get added excessively.""" - ruler = nlp.add_pipe("entity_ruler", config={"validate": True}) - ruler.add_patterns(patterns) - pattern_count = sum(len(mm) for mm in ruler.matcher._patterns.values()) - assert pattern_count > 0 - ruler.add_patterns([]) - after_count = sum(len(mm) for mm in ruler.matcher._patterns.values()) - assert after_count == pattern_count diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py index 102989705..1d50fd1d1 100644 --- a/spacy/tests/serialize/test_serialize_config.py +++ b/spacy/tests/serialize/test_serialize_config.py @@ -1,20 +1,17 @@ import pytest -from thinc.api import Config, ConfigValidationError -import spacy -from spacy.lang.en import English -from spacy.lang.de import German -from spacy.language import Language, DEFAULT_CONFIG, DEFAULT_CONFIG_PRETRAIN_PATH -from spacy.util import ( - registry, - load_model_from_config, - load_config, - load_config_from_str, -) -from spacy.ml.models import build_Tok2Vec_model, build_tb_parser_model -from spacy.ml.models import MultiHashEmbed, MaxoutWindowEncoder -from spacy.schemas import ConfigSchema, ConfigSchemaPretrain from catalogue import RegistryError +from thinc.api import Config, ConfigValidationError +import spacy +from spacy.lang.de import German +from spacy.lang.en import English +from spacy.language import DEFAULT_CONFIG, DEFAULT_CONFIG_PRETRAIN_PATH +from spacy.language import Language +from spacy.ml.models import MaxoutWindowEncoder, MultiHashEmbed +from spacy.ml.models import build_tb_parser_model, build_Tok2Vec_model +from spacy.schemas import ConfigSchema, ConfigSchemaPretrain +from 
spacy.util import load_config, load_config_from_str +from spacy.util import load_model_from_config, registry from ..util import make_tempdir @@ -187,6 +184,25 @@ def my_parser(): return parser +@pytest.mark.issue(8190) +def test_issue8190(): + """Test that config overrides are not lost after load is complete.""" + source_cfg = { + "nlp": { + "lang": "en", + }, + "custom": {"key": "value"}, + } + source_nlp = English.from_config(source_cfg) + with make_tempdir() as dir_path: + # We need to create a loadable source pipeline + source_path = dir_path / "test_model" + source_nlp.to_disk(source_path) + nlp = spacy.load(source_path, config={"custom": {"key": "updated_value"}}) + + assert nlp.config["custom"]["key"] == "updated_value" + + def test_create_nlp_from_config(): config = Config().from_str(nlp_config_string) with pytest.raises(ConfigValidationError): diff --git a/spacy/tests/serialize/test_serialize_doc.py b/spacy/tests/serialize/test_serialize_doc.py index 23afaf26c..15bf67bfd 100644 --- a/spacy/tests/serialize/test_serialize_doc.py +++ b/spacy/tests/serialize/test_serialize_doc.py @@ -1,13 +1,168 @@ -import pytest -from spacy.tokens.underscore import Underscore +import copy +import pickle -import spacy +import numpy +import pytest + +from spacy.attrs import DEP, HEAD from spacy.lang.en import English -from spacy.tokens import Doc, DocBin +from spacy.language import Language +from spacy.matcher import Matcher, PhraseMatcher +from spacy.tokens import Doc +from spacy.vectors import Vectors +from spacy.vocab import Vocab from ..util import make_tempdir +@pytest.mark.issue(1727) +def test_issue1727(): + """Test that models with no pretrained vectors can be deserialized + correctly after vectors are added.""" + nlp = Language(Vocab()) + data = numpy.ones((3, 300), dtype="f") + vectors = Vectors(data=data, keys=["I", "am", "Matt"]) + tagger = nlp.create_pipe("tagger") + tagger.add_label("PRP") + assert tagger.cfg.get("pretrained_dims", 0) == 0 + tagger.vocab.vectors = vectors + with make_tempdir() as path: + tagger.to_disk(path) + tagger = nlp.create_pipe("tagger").from_disk(path) + assert tagger.cfg.get("pretrained_dims", 0) == 0 + + +@pytest.mark.issue(1799) +def test_issue1799(): + """Test sentence boundaries are deserialized correctly, even for + non-projective sentences.""" + heads_deps = numpy.asarray( + [ + [1, 397], + [4, 436], + [2, 426], + [1, 402], + [0, 8206900633647566924], + [18446744073709551615, 440], + [18446744073709551614, 442], + ], + dtype="uint64", + ) + doc = Doc(Vocab(), words="Just what I was looking for .".split()) + doc.vocab.strings.add("ROOT") + doc = doc.from_array([HEAD, DEP], heads_deps) + assert len(list(doc.sents)) == 1 + + +@pytest.mark.issue(1834) +def test_issue1834(): + """Test that sentence boundaries & parse/tag flags are not lost + during serialization.""" + words = ["This", "is", "a", "first", "sentence", ".", "And", "another", "one"] + doc = Doc(Vocab(), words=words) + doc[6].is_sent_start = True + new_doc = Doc(doc.vocab).from_bytes(doc.to_bytes()) + assert new_doc[6].sent_start + assert not new_doc.has_annotation("DEP") + assert not new_doc.has_annotation("TAG") + doc = Doc( + Vocab(), + words=words, + tags=["TAG"] * len(words), + heads=[0, 0, 0, 0, 0, 0, 6, 6, 6], + deps=["dep"] * len(words), + ) + new_doc = Doc(doc.vocab).from_bytes(doc.to_bytes()) + assert new_doc[6].sent_start + assert new_doc.has_annotation("DEP") + assert new_doc.has_annotation("TAG") + + +@pytest.mark.issue(1883) +def test_issue1883(): + matcher = Matcher(Vocab()) + 
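
The deep-copy behaviour covered by this test can be reproduced standalone. A minimal sketch, using an example pattern name:

import copy
from spacy.matcher import Matcher
from spacy.tokens import Doc
from spacy.vocab import Vocab

matcher = Matcher(Vocab())
matcher.add("GREETING", [[{"LOWER": "hello"}]])
matcher_copy = copy.deepcopy(matcher)
# The copy keeps a usable vocab and the same patterns
doc = Doc(matcher_copy.vocab, words=["hello"])
assert len(matcher_copy(doc)) == 1
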
matcher.add("pat1", [[{"orth": "hello"}]]) + doc = Doc(matcher.vocab, words=["hello"]) + assert len(matcher(doc)) == 1 + new_matcher = copy.deepcopy(matcher) + new_doc = Doc(new_matcher.vocab, words=["hello"]) + assert len(new_matcher(new_doc)) == 1 + + +@pytest.mark.issue(2564) +def test_issue2564(): + """Test the tagger sets has_annotation("TAG") correctly when used via Language.pipe.""" + nlp = Language() + tagger = nlp.add_pipe("tagger") + tagger.add_label("A") + nlp.initialize() + doc = nlp("hello world") + assert doc.has_annotation("TAG") + docs = nlp.pipe(["hello", "world"]) + piped_doc = next(docs) + assert piped_doc.has_annotation("TAG") + + +@pytest.mark.issue(3248) +def test_issue3248_2(): + """Test that the PhraseMatcher can be pickled correctly.""" + nlp = English() + matcher = PhraseMatcher(nlp.vocab) + matcher.add("TEST1", [nlp("a"), nlp("b"), nlp("c")]) + matcher.add("TEST2", [nlp("d")]) + data = pickle.dumps(matcher) + new_matcher = pickle.loads(data) + assert len(new_matcher) == len(matcher) + + +@pytest.mark.issue(3289) +def test_issue3289(): + """Test that Language.to_bytes handles serializing a pipeline component + with an uninitialized model.""" + nlp = English() + nlp.add_pipe("textcat") + bytes_data = nlp.to_bytes() + new_nlp = English() + new_nlp.add_pipe("textcat") + new_nlp.from_bytes(bytes_data) + + +@pytest.mark.issue(3468) +def test_issue3468(): + """Test that sentence boundaries are set correctly so Doc.has_annotation("SENT_START") can + be restored after serialization.""" + nlp = English() + nlp.add_pipe("sentencizer") + doc = nlp("Hello world") + assert doc[0].is_sent_start + assert doc.has_annotation("SENT_START") + assert len(list(doc.sents)) == 1 + doc_bytes = doc.to_bytes() + new_doc = Doc(nlp.vocab).from_bytes(doc_bytes) + assert new_doc[0].is_sent_start + assert new_doc.has_annotation("SENT_START") + assert len(list(new_doc.sents)) == 1 + + +@pytest.mark.issue(3959) +def test_issue3959(): + """Ensure that a modified pos attribute is serialized correctly.""" + nlp = English() + doc = nlp( + "displaCy uses JavaScript, SVG and CSS to show you how computers understand language" + ) + assert doc[0].pos_ == "" + doc[0].pos_ = "NOUN" + assert doc[0].pos_ == "NOUN" + # usually this is already True when starting from proper models instead of blank English + with make_tempdir() as tmp_dir: + file_path = tmp_dir / "my_doc" + doc.to_disk(file_path) + doc2 = nlp("") + doc2.from_disk(file_path) + assert doc2[0].pos_ == "NOUN" + + def test_serialize_empty_doc(en_vocab): doc = Doc(en_vocab) data = doc.to_bytes() @@ -61,69 +216,3 @@ def test_serialize_doc_span_groups(en_vocab): doc.spans["content"] = [doc[0:2]] new_doc = Doc(en_vocab).from_bytes(doc.to_bytes()) assert len(new_doc.spans["content"]) == 1 - - -def test_serialize_doc_bin(): - doc_bin = DocBin( - attrs=["LEMMA", "ENT_IOB", "ENT_TYPE", "NORM", "ENT_ID"], store_user_data=True - ) - texts = ["Some text", "Lots of texts...", "..."] - cats = {"A": 0.5} - nlp = English() - for doc in nlp.pipe(texts): - doc.cats = cats - doc.spans["start"] = [doc[0:2]] - doc[0].norm_ = "UNUSUAL_TOKEN_NORM" - doc[0].ent_id_ = "UNUSUAL_TOKEN_ENT_ID" - doc_bin.add(doc) - bytes_data = doc_bin.to_bytes() - - # Deserialize later, e.g. 
in a new process - nlp = spacy.blank("en") - doc_bin = DocBin().from_bytes(bytes_data) - reloaded_docs = list(doc_bin.get_docs(nlp.vocab)) - for i, doc in enumerate(reloaded_docs): - assert doc.text == texts[i] - assert doc.cats == cats - assert len(doc.spans) == 1 - assert doc[0].norm_ == "UNUSUAL_TOKEN_NORM" - assert doc[0].ent_id_ == "UNUSUAL_TOKEN_ENT_ID" - - -def test_serialize_doc_bin_unknown_spaces(en_vocab): - doc1 = Doc(en_vocab, words=["that", "'s"]) - assert doc1.has_unknown_spaces - assert doc1.text == "that 's " - doc2 = Doc(en_vocab, words=["that", "'s"], spaces=[False, False]) - assert not doc2.has_unknown_spaces - assert doc2.text == "that's" - - doc_bin = DocBin().from_bytes(DocBin(docs=[doc1, doc2]).to_bytes()) - re_doc1, re_doc2 = doc_bin.get_docs(en_vocab) - assert re_doc1.has_unknown_spaces - assert re_doc1.text == "that 's " - assert not re_doc2.has_unknown_spaces - assert re_doc2.text == "that's" - - -@pytest.mark.parametrize( - "writer_flag,reader_flag,reader_value", - [ - (True, True, "bar"), - (True, False, "bar"), - (False, True, "nothing"), - (False, False, "nothing"), - ], -) -def test_serialize_custom_extension(en_vocab, writer_flag, reader_flag, reader_value): - """Test that custom extensions are correctly serialized in DocBin.""" - Doc.set_extension("foo", default="nothing") - doc = Doc(en_vocab, words=["hello", "world"]) - doc._.foo = "bar" - doc_bin_1 = DocBin(store_user_data=writer_flag) - doc_bin_1.add(doc) - doc_bin_bytes = doc_bin_1.to_bytes() - doc_bin_2 = DocBin(store_user_data=reader_flag).from_bytes(doc_bin_bytes) - doc_2 = list(doc_bin_2.get_docs(en_vocab))[0] - assert doc_2._.foo == reader_value - Underscore.doc_extensions = {} diff --git a/spacy/tests/serialize/test_serialize_docbin.py b/spacy/tests/serialize/test_serialize_docbin.py new file mode 100644 index 000000000..9f8e5e06b --- /dev/null +++ b/spacy/tests/serialize/test_serialize_docbin.py @@ -0,0 +1,106 @@ +import pytest + +import spacy +from spacy.lang.en import English +from spacy.tokens import Doc, DocBin +from spacy.tokens.underscore import Underscore + + +@pytest.mark.issue(4367) +def test_issue4367(): + """Test that docbin init goes well""" + DocBin() + DocBin(attrs=["LEMMA"]) + DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"]) + + +@pytest.mark.issue(4528) +def test_issue4528(en_vocab): + """Test that user_data is correctly serialized in DocBin.""" + doc = Doc(en_vocab, words=["hello", "world"]) + doc.user_data["foo"] = "bar" + # This is how extension attribute values are stored in the user data + doc.user_data[("._.", "foo", None, None)] = "bar" + doc_bin = DocBin(store_user_data=True) + doc_bin.add(doc) + doc_bin_bytes = doc_bin.to_bytes() + new_doc_bin = DocBin(store_user_data=True).from_bytes(doc_bin_bytes) + new_doc = list(new_doc_bin.get_docs(en_vocab))[0] + assert new_doc.user_data["foo"] == "bar" + assert new_doc.user_data[("._.", "foo", None, None)] == "bar" + + +@pytest.mark.issue(5141) +def test_issue5141(en_vocab): + """Ensure an empty DocBin does not crash on serialization""" + doc_bin = DocBin(attrs=["DEP", "HEAD"]) + assert list(doc_bin.get_docs(en_vocab)) == [] + doc_bin_bytes = doc_bin.to_bytes() + doc_bin_2 = DocBin().from_bytes(doc_bin_bytes) + assert list(doc_bin_2.get_docs(en_vocab)) == [] + + +def test_serialize_doc_bin(): + doc_bin = DocBin( + attrs=["LEMMA", "ENT_IOB", "ENT_TYPE", "NORM", "ENT_ID"], store_user_data=True + ) + texts = ["Some text", "Lots of texts...", "..."] + cats = {"A": 0.5} + nlp = English() + for doc in nlp.pipe(texts): + doc.cats = cats + 
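
The DocBin round trip exercised by this test, reduced to its core. A minimal sketch, assuming a blank English pipeline:

import spacy
from spacy.tokens import DocBin

nlp = spacy.blank("en")
doc_bin = DocBin(store_user_data=True)
doc_bin.add(nlp("Some text"))
payload = doc_bin.to_bytes()
# Later, e.g. in another process, deserialize against a fresh vocab
docs = list(DocBin().from_bytes(payload).get_docs(spacy.blank("en").vocab))
assert docs[0].text == "Some text"
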
doc.spans["start"] = [doc[0:2]] + doc[0].norm_ = "UNUSUAL_TOKEN_NORM" + doc[0].ent_id_ = "UNUSUAL_TOKEN_ENT_ID" + doc_bin.add(doc) + bytes_data = doc_bin.to_bytes() + + # Deserialize later, e.g. in a new process + nlp = spacy.blank("en") + doc_bin = DocBin().from_bytes(bytes_data) + reloaded_docs = list(doc_bin.get_docs(nlp.vocab)) + for i, doc in enumerate(reloaded_docs): + assert doc.text == texts[i] + assert doc.cats == cats + assert len(doc.spans) == 1 + assert doc[0].norm_ == "UNUSUAL_TOKEN_NORM" + assert doc[0].ent_id_ == "UNUSUAL_TOKEN_ENT_ID" + + +def test_serialize_doc_bin_unknown_spaces(en_vocab): + doc1 = Doc(en_vocab, words=["that", "'s"]) + assert doc1.has_unknown_spaces + assert doc1.text == "that 's " + doc2 = Doc(en_vocab, words=["that", "'s"], spaces=[False, False]) + assert not doc2.has_unknown_spaces + assert doc2.text == "that's" + + doc_bin = DocBin().from_bytes(DocBin(docs=[doc1, doc2]).to_bytes()) + re_doc1, re_doc2 = doc_bin.get_docs(en_vocab) + assert re_doc1.has_unknown_spaces + assert re_doc1.text == "that 's " + assert not re_doc2.has_unknown_spaces + assert re_doc2.text == "that's" + + +@pytest.mark.parametrize( + "writer_flag,reader_flag,reader_value", + [ + (True, True, "bar"), + (True, False, "bar"), + (False, True, "nothing"), + (False, False, "nothing"), + ], +) +def test_serialize_custom_extension(en_vocab, writer_flag, reader_flag, reader_value): + """Test that custom extensions are correctly serialized in DocBin.""" + Doc.set_extension("foo", default="nothing") + doc = Doc(en_vocab, words=["hello", "world"]) + doc._.foo = "bar" + doc_bin_1 = DocBin(store_user_data=writer_flag) + doc_bin_1.add(doc) + doc_bin_bytes = doc_bin_1.to_bytes() + doc_bin_2 = DocBin(store_user_data=reader_flag).from_bytes(doc_bin_bytes) + doc_2 = list(doc_bin_2.get_docs(en_vocab))[0] + assert doc_2._.foo == reader_value + Underscore.doc_extensions = {} diff --git a/spacy/tests/serialize/test_serialize_language.py b/spacy/tests/serialize/test_serialize_language.py index 05529f9d1..6e7fa0e4e 100644 --- a/spacy/tests/serialize/test_serialize_language.py +++ b/spacy/tests/serialize/test_serialize_language.py @@ -1,8 +1,14 @@ -import pytest import re +import pickle + +import pytest from spacy.language import Language +from spacy.lang.it import Italian +from spacy.lang.en import English from spacy.tokenizer import Tokenizer +from spacy.training import Example +from spacy.util import load_config_from_str from ..util import make_tempdir @@ -21,6 +27,71 @@ def meta_data(): } +@pytest.mark.issue(2482) +def test_issue2482(): + """Test we can serialize and deserialize a blank NER or parser model.""" + nlp = Italian() + nlp.add_pipe("ner") + b = nlp.to_bytes() + Italian().from_bytes(b) + + +CONFIG_ISSUE_6950 = """ +[nlp] +lang = "en" +pipeline = ["tok2vec", "tagger"] + +[components] + +[components.tok2vec] +factory = "tok2vec" + +[components.tok2vec.model] +@architectures = "spacy.Tok2Vec.v1" + +[components.tok2vec.model.embed] +@architectures = "spacy.MultiHashEmbed.v1" +width = ${components.tok2vec.model.encode:width} +attrs = ["NORM","PREFIX","SUFFIX","SHAPE"] +rows = [5000,2500,2500,2500] +include_static_vectors = false + +[components.tok2vec.model.encode] +@architectures = "spacy.MaxoutWindowEncoder.v1" +width = 96 +depth = 4 +window_size = 1 +maxout_pieces = 3 + +[components.ner] +factory = "ner" + +[components.tagger] +factory = "tagger" + +[components.tagger.model] +@architectures = "spacy.Tagger.v1" +nO = null + +[components.tagger.model.tok2vec] +@architectures = 
"spacy.Tok2VecListener.v1" +width = ${components.tok2vec.model.encode:width} +upstream = "*" +""" + + +@pytest.mark.issue(6950) +def test_issue6950(): + """Test that the nlp object with initialized tok2vec with listeners pickles + correctly (and doesn't have lambdas). + """ + nlp = English.from_config(load_config_from_str(CONFIG_ISSUE_6950)) + nlp.initialize(lambda: [Example.from_dict(nlp.make_doc("hello"), {"tags": ["V"]})]) + pickle.dumps(nlp) + nlp("hello") + pickle.dumps(nlp) + + def test_serialize_language_meta_disk(meta_data): language = Language(meta=meta_data) with make_tempdir() as d: diff --git a/spacy/tests/serialize/test_serialize_pipeline.py b/spacy/tests/serialize/test_serialize_pipeline.py index eebf72638..9fcf18e2d 100644 --- a/spacy/tests/serialize/test_serialize_pipeline.py +++ b/spacy/tests/serialize/test_serialize_pipeline.py @@ -1,18 +1,25 @@ +import pickle + import pytest -from spacy import registry, Vocab, load -from spacy.pipeline import Tagger, DependencyParser, EntityRecognizer -from spacy.pipeline import TextCategorizer, SentenceRecognizer, TrainablePipe +import srsly +from thinc.api import Linear + +import spacy +from spacy import Vocab, load, registry +from spacy.lang.en import English +from spacy.language import Language +from spacy.pipeline import DependencyParser, EntityRecognizer, EntityRuler +from spacy.pipeline import SentenceRecognizer, Tagger, TextCategorizer +from spacy.pipeline import TrainablePipe from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL +from spacy.pipeline.senter import DEFAULT_SENTER_MODEL from spacy.pipeline.tagger import DEFAULT_TAGGER_MODEL from spacy.pipeline.textcat import DEFAULT_SINGLE_TEXTCAT_MODEL -from spacy.pipeline.senter import DEFAULT_SENTER_MODEL -from spacy.lang.en import English -from thinc.api import Linear -import spacy +from spacy.util import ensure_path, load_model +from spacy.tokens import Span from ..util import make_tempdir - test_parsers = [DependencyParser, EntityRecognizer] @@ -58,6 +65,181 @@ def taggers(en_vocab): return tagger1, tagger2 +@pytest.mark.issue(3456) +def test_issue3456(): + # this crashed because of a padding error in layer.ops.unflatten in thinc + nlp = English() + tagger = nlp.add_pipe("tagger") + tagger.add_label("A") + nlp.initialize() + list(nlp.pipe(["hi", ""])) + + +@pytest.mark.issue(3526) +def test_issue_3526_1(en_vocab): + patterns = [ + {"label": "HELLO", "pattern": "hello world"}, + {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]}, + {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]}, + {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]}, + {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"}, + ] + nlp = Language(vocab=en_vocab) + ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True) + ruler_bytes = ruler.to_bytes() + assert len(ruler) == len(patterns) + assert len(ruler.labels) == 4 + assert ruler.overwrite + new_ruler = EntityRuler(nlp) + new_ruler = new_ruler.from_bytes(ruler_bytes) + assert len(new_ruler) == len(ruler) + assert len(new_ruler.labels) == 4 + assert new_ruler.overwrite == ruler.overwrite + assert new_ruler.ent_id_sep == ruler.ent_id_sep + + +@pytest.mark.issue(3526) +def test_issue_3526_2(en_vocab): + patterns = [ + {"label": "HELLO", "pattern": "hello world"}, + {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]}, + {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]}, + {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]}, + {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"}, + ] + nlp = 
Language(vocab=en_vocab) + ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True) + bytes_old_style = srsly.msgpack_dumps(ruler.patterns) + new_ruler = EntityRuler(nlp) + new_ruler = new_ruler.from_bytes(bytes_old_style) + assert len(new_ruler) == len(ruler) + for pattern in ruler.patterns: + assert pattern in new_ruler.patterns + assert new_ruler.overwrite is not ruler.overwrite + + +@pytest.mark.issue(3526) +def test_issue_3526_3(en_vocab): + patterns = [ + {"label": "HELLO", "pattern": "hello world"}, + {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]}, + {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]}, + {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]}, + {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"}, + ] + nlp = Language(vocab=en_vocab) + ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True) + with make_tempdir() as tmpdir: + out_file = tmpdir / "entity_ruler" + srsly.write_jsonl(out_file.with_suffix(".jsonl"), ruler.patterns) + new_ruler = EntityRuler(nlp).from_disk(out_file) + for pattern in ruler.patterns: + assert pattern in new_ruler.patterns + assert len(new_ruler) == len(ruler) + assert new_ruler.overwrite is not ruler.overwrite + + +@pytest.mark.issue(3526) +def test_issue_3526_4(en_vocab): + nlp = Language(vocab=en_vocab) + patterns = [{"label": "ORG", "pattern": "Apple"}] + config = {"overwrite_ents": True} + ruler = nlp.add_pipe("entity_ruler", config=config) + ruler.add_patterns(patterns) + with make_tempdir() as tmpdir: + nlp.to_disk(tmpdir) + ruler = nlp.get_pipe("entity_ruler") + assert ruler.patterns == [{"label": "ORG", "pattern": "Apple"}] + assert ruler.overwrite is True + nlp2 = load(tmpdir) + new_ruler = nlp2.get_pipe("entity_ruler") + assert new_ruler.patterns == [{"label": "ORG", "pattern": "Apple"}] + assert new_ruler.overwrite is True + + +@pytest.mark.issue(4042) +def test_issue4042(): + """Test that serialization of an EntityRuler before NER works fine.""" + nlp = English() + # add ner pipe + ner = nlp.add_pipe("ner") + ner.add_label("SOME_LABEL") + nlp.initialize() + # Add entity ruler + patterns = [ + {"label": "MY_ORG", "pattern": "Apple"}, + {"label": "MY_GPE", "pattern": [{"lower": "san"}, {"lower": "francisco"}]}, + ] + # works fine with "after" + ruler = nlp.add_pipe("entity_ruler", before="ner") + ruler.add_patterns(patterns) + doc1 = nlp("What do you think about Apple ?") + assert doc1.ents[0].label_ == "MY_ORG" + + with make_tempdir() as d: + output_dir = ensure_path(d) + if not output_dir.exists(): + output_dir.mkdir() + nlp.to_disk(output_dir) + nlp2 = load_model(output_dir) + doc2 = nlp2("What do you think about Apple ?") + assert doc2.ents[0].label_ == "MY_ORG" + + +@pytest.mark.issue(4042) +def test_issue4042_bug2(): + """ + Test that serialization of an NER works fine when new labels were added. + This is the second bug of two bugs underlying the issue 4042. + """ + nlp1 = English() + # add ner pipe + ner1 = nlp1.add_pipe("ner") + ner1.add_label("SOME_LABEL") + nlp1.initialize() + # add a new label to the doc + doc1 = nlp1("What do you think about Apple ?") + assert len(ner1.labels) == 1 + assert "SOME_LABEL" in ner1.labels + apple_ent = Span(doc1, 5, 6, label="MY_ORG") + doc1.ents = list(doc1.ents) + [apple_ent] + # Add the label explicitly. Previously we didn't require this. 
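
The labelling requirement noted in the comment above can be shown in a compact form. A minimal sketch, assuming a blank English pipeline:

from spacy.lang.en import English
from spacy.tokens import Span

nlp = English()
ner = nlp.add_pipe("ner")
ner.add_label("SOME_LABEL")
nlp.initialize()
# Register any new label on the component before using it in doc.ents
ner.add_label("MY_ORG")
doc = nlp.make_doc("What do you think about Apple ?")
doc.ents = [Span(doc, 5, 6, label="MY_ORG")]
assert "MY_ORG" in ner.labels and doc.ents[0].label_ == "MY_ORG"
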
+ ner1.add_label("MY_ORG") + ner1(doc1) + assert len(ner1.labels) == 2 + assert "SOME_LABEL" in ner1.labels + assert "MY_ORG" in ner1.labels + with make_tempdir() as d: + # assert IO goes fine + output_dir = ensure_path(d) + if not output_dir.exists(): + output_dir.mkdir() + ner1.to_disk(output_dir) + config = {} + ner2 = nlp1.create_pipe("ner", config=config) + ner2.from_disk(output_dir) + assert len(ner2.labels) == 2 + + +@pytest.mark.issue(4725) +def test_issue4725_1(): + """Ensure the pickling of the NER goes well""" + vocab = Vocab(vectors_name="test_vocab_add_vector") + nlp = English(vocab=vocab) + config = { + "update_with_oracle_cut_size": 111, + } + ner = nlp.create_pipe("ner", config=config) + with make_tempdir() as tmp_path: + with (tmp_path / "ner.pkl").open("wb") as file_: + pickle.dump(ner, file_) + assert ner.cfg["update_with_oracle_cut_size"] == 111 + + with (tmp_path / "ner.pkl").open("rb") as file_: + ner2 = pickle.load(file_) + assert ner2.cfg["update_with_oracle_cut_size"] == 111 + + @pytest.mark.parametrize("Parser", test_parsers) def test_serialize_parser_roundtrip_bytes(en_vocab, Parser): cfg = {"model": DEFAULT_PARSER_MODEL} diff --git a/spacy/tests/serialize/test_serialize_tokenizer.py b/spacy/tests/serialize/test_serialize_tokenizer.py index a9450cd04..e271f7707 100644 --- a/spacy/tests/serialize/test_serialize_tokenizer.py +++ b/spacy/tests/serialize/test_serialize_tokenizer.py @@ -1,9 +1,16 @@ -import pytest +import pickle import re -from spacy.util import get_lang_class -from spacy.tokenizer import Tokenizer -from ..util import make_tempdir, assert_packed_msg_equal +import pytest + +from spacy.attrs import ENT_IOB, ENT_TYPE +from spacy.lang.en import English +from spacy.tokenizer import Tokenizer +from spacy.tokens import Doc +from spacy.util import compile_infix_regex, compile_prefix_regex +from spacy.util import compile_suffix_regex, get_lang_class, load_model + +from ..util import assert_packed_msg_equal, make_tempdir def load_tokenizer(b): @@ -12,6 +19,79 @@ def load_tokenizer(b): return tok +@pytest.mark.issue(2833) +def test_issue2833(en_vocab): + """Test that a custom error is raised if a token or span is pickled.""" + doc = Doc(en_vocab, words=["Hello", "world"]) + with pytest.raises(NotImplementedError): + pickle.dumps(doc[0]) + with pytest.raises(NotImplementedError): + pickle.dumps(doc[0:2]) + + +@pytest.mark.issue(3012) +def test_issue3012(en_vocab): + """Test that the is_tagged attribute doesn't get overwritten when we from_array + without tag information.""" + words = ["This", "is", "10", "%", "."] + tags = ["DT", "VBZ", "CD", "NN", "."] + pos = ["DET", "VERB", "NUM", "NOUN", "PUNCT"] + ents = ["O", "O", "B-PERCENT", "I-PERCENT", "O"] + doc = Doc(en_vocab, words=words, tags=tags, pos=pos, ents=ents) + assert doc.has_annotation("TAG") + expected = ("10", "NUM", "CD", "PERCENT") + assert (doc[2].text, doc[2].pos_, doc[2].tag_, doc[2].ent_type_) == expected + header = [ENT_IOB, ENT_TYPE] + ent_array = doc.to_array(header) + doc.from_array(header, ent_array) + assert (doc[2].text, doc[2].pos_, doc[2].tag_, doc[2].ent_type_) == expected + # Serializing then deserializing + doc_bytes = doc.to_bytes() + doc2 = Doc(en_vocab).from_bytes(doc_bytes) + assert (doc2[2].text, doc2[2].pos_, doc2[2].tag_, doc2[2].ent_type_) == expected + + +@pytest.mark.issue(4190) +def test_issue4190(): + def customize_tokenizer(nlp): + prefix_re = compile_prefix_regex(nlp.Defaults.prefixes) + suffix_re = compile_suffix_regex(nlp.Defaults.suffixes) + infix_re = 
compile_infix_regex(nlp.Defaults.infixes) + # Remove all exceptions where a single letter is followed by a period (e.g. 'h.') + exceptions = { + k: v + for k, v in dict(nlp.Defaults.tokenizer_exceptions).items() + if not (len(k) == 2 and k[1] == ".") + } + new_tokenizer = Tokenizer( + nlp.vocab, + exceptions, + prefix_search=prefix_re.search, + suffix_search=suffix_re.search, + infix_finditer=infix_re.finditer, + token_match=nlp.tokenizer.token_match, + ) + nlp.tokenizer = new_tokenizer + + test_string = "Test c." + # Load default language + nlp_1 = English() + doc_1a = nlp_1(test_string) + result_1a = [token.text for token in doc_1a] # noqa: F841 + # Modify tokenizer + customize_tokenizer(nlp_1) + doc_1b = nlp_1(test_string) + result_1b = [token.text for token in doc_1b] + # Save and Reload + with make_tempdir() as model_dir: + nlp_1.to_disk(model_dir) + nlp_2 = load_model(model_dir) + # This should be the modified tokenizer + doc_2 = nlp_2(test_string) + result_2 = [token.text for token in doc_2] + assert result_1b == result_2 + + def test_serialize_custom_tokenizer(en_vocab, en_tokenizer): """Test that custom tokenizer with not all functions defined or empty properties can be serialized and deserialized correctly (see #2494, diff --git a/spacy/tests/serialize/test_serialize_vocab_strings.py b/spacy/tests/serialize/test_serialize_vocab_strings.py index ab403ab54..fd80c3d8e 100644 --- a/spacy/tests/serialize/test_serialize_vocab_strings.py +++ b/spacy/tests/serialize/test_serialize_vocab_strings.py @@ -1,17 +1,71 @@ -import pytest import pickle + +import pytest from thinc.api import get_current_ops -from spacy.vocab import Vocab + +import spacy +from spacy.lang.en import English from spacy.strings import StringStore +from spacy.tokens import Doc +from spacy.util import ensure_path, load_model from spacy.vectors import Vectors +from spacy.vocab import Vocab from ..util import make_tempdir - test_strings = [([], []), (["rats", "are", "cute"], ["i", "like", "rats"])] test_strings_attrs = [(["rats", "are", "cute"], "Hello")] +@pytest.mark.issue(599) +def test_issue599(en_vocab): + doc = Doc(en_vocab) + doc2 = Doc(doc.vocab) + doc2.from_bytes(doc.to_bytes()) + assert doc2.has_annotation("DEP") + + +@pytest.mark.issue(4054) +def test_issue4054(en_vocab): + """Test that a new blank model can be made with a vocab from file, + and that serialization does not drop the language at any point.""" + nlp1 = English() + vocab1 = nlp1.vocab + with make_tempdir() as d: + vocab_dir = ensure_path(d / "vocab") + if not vocab_dir.exists(): + vocab_dir.mkdir() + vocab1.to_disk(vocab_dir) + vocab2 = Vocab().from_disk(vocab_dir) + nlp2 = spacy.blank("en", vocab=vocab2) + nlp_dir = ensure_path(d / "nlp") + if not nlp_dir.exists(): + nlp_dir.mkdir() + nlp2.to_disk(nlp_dir) + nlp3 = load_model(nlp_dir) + assert nlp3.lang == "en" + + +@pytest.mark.issue(4133) +def test_issue4133(en_vocab): + nlp = English() + vocab_bytes = nlp.vocab.to_bytes() + words = ["Apple", "is", "looking", "at", "buying", "a", "startup"] + pos = ["NOUN", "VERB", "ADP", "VERB", "PROPN", "NOUN", "ADP"] + doc = Doc(en_vocab, words=words) + for i, token in enumerate(doc): + token.pos_ = pos[i] + # usually this is already True when starting from proper models instead of blank English + doc_bytes = doc.to_bytes() + vocab = Vocab() + vocab = vocab.from_bytes(vocab_bytes) + doc = Doc(vocab).from_bytes(doc_bytes) + actual = [] + for token in doc: + actual.append(token.pos_) + assert actual == pos + + @pytest.mark.parametrize("text", ["rat"]) def 
test_serialize_vocab(en_vocab, text): text_hash = en_vocab.strings.add(text) diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index c6b00b140..b0862eab6 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -1,28 +1,103 @@ -import pytest -from click import NoSuchOption -from packaging.specifiers import SpecifierSet -from spacy.training import docs_to_json, offsets_to_biluo_tags -from spacy.training.converters import iob_to_docs, conll_ner_to_docs, conllu_to_docs -from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate -from spacy.lang.nl import Dutch -from spacy.util import ENV_VARS, load_model_from_config -from spacy.cli import info -from spacy.cli.init_config import init_config, RECOMMENDATIONS -from spacy.cli._util import validate_project_commands, parse_config_overrides -from spacy.cli._util import load_project_config, substitute_project_variables -from spacy.cli._util import is_subpath_of -from spacy.cli._util import string_to_list -from spacy import about -from spacy.util import get_minor_version -from spacy.cli.validate import get_model_pkgs -from spacy.cli.download import get_compatibility, get_version -from spacy.cli.package import get_third_party_dependencies -from thinc.api import ConfigValidationError, Config -import srsly import os -from .util import make_tempdir +import pytest +import srsly +from click import NoSuchOption +from packaging.specifiers import SpecifierSet +from thinc.api import Config, ConfigValidationError + +from spacy import about +from spacy.cli import info +from spacy.cli._util import is_subpath_of, load_project_config +from spacy.cli._util import parse_config_overrides, string_to_list +from spacy.cli._util import substitute_project_variables +from spacy.cli._util import validate_project_commands +from spacy.cli.download import get_compatibility, get_version +from spacy.cli.init_config import RECOMMENDATIONS, init_config, fill_config +from spacy.cli.package import get_third_party_dependencies +from spacy.cli.validate import get_model_pkgs +from spacy.lang.en import English +from spacy.lang.nl import Dutch +from spacy.language import Language +from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate +from spacy.training import Example, docs_to_json, offsets_to_biluo_tags +from spacy.training.converters import conll_ner_to_docs, conllu_to_docs +from spacy.training.converters import iob_to_docs +from spacy.util import ENV_VARS, get_minor_version, load_model_from_config, load_config + from ..cli.init_pipeline import _init_labels +from .util import make_tempdir + + +@pytest.mark.issue(4665) +def test_issue4665(): + """ + conllu_to_docs should not raise an exception if the HEAD column contains an + underscore + """ + input_data = """ +1 [ _ PUNCT -LRB- _ _ punct _ _ +2 This _ DET DT _ _ det _ _ +3 killing _ NOUN NN _ _ nsubj _ _ +4 of _ ADP IN _ _ case _ _ +5 a _ DET DT _ _ det _ _ +6 respected _ ADJ JJ _ _ amod _ _ +7 cleric _ NOUN NN _ _ nmod _ _ +8 will _ AUX MD _ _ aux _ _ +9 be _ AUX VB _ _ aux _ _ +10 causing _ VERB VBG _ _ root _ _ +11 us _ PRON PRP _ _ iobj _ _ +12 trouble _ NOUN NN _ _ dobj _ _ +13 for _ ADP IN _ _ case _ _ +14 years _ NOUN NNS _ _ nmod _ _ +15 to _ PART TO _ _ mark _ _ +16 come _ VERB VB _ _ acl _ _ +17 . _ PUNCT . 
_ _ punct _ _ +18 ] _ PUNCT -RRB- _ _ punct _ _ +""" + conllu_to_docs(input_data) + + +@pytest.mark.issue(4924) +def test_issue4924(): + nlp = Language() + example = Example.from_dict(nlp.make_doc(""), {}) + nlp.evaluate([example]) + + +@pytest.mark.issue(7055) +def test_issue7055(): + """Test that fill-config doesn't turn sourced components into factories.""" + source_cfg = { + "nlp": {"lang": "en", "pipeline": ["tok2vec", "tagger"]}, + "components": { + "tok2vec": {"factory": "tok2vec"}, + "tagger": {"factory": "tagger"}, + }, + } + source_nlp = English.from_config(source_cfg) + with make_tempdir() as dir_path: + # We need to create a loadable source pipeline + source_path = dir_path / "test_model" + source_nlp.to_disk(source_path) + base_cfg = { + "nlp": {"lang": "en", "pipeline": ["tok2vec", "tagger", "ner"]}, + "components": { + "tok2vec": {"source": str(source_path)}, + "tagger": {"source": str(source_path)}, + "ner": {"factory": "ner"}, + }, + } + base_cfg = Config(base_cfg) + base_path = dir_path / "base.cfg" + base_cfg.to_disk(base_path) + output_path = dir_path / "config.cfg" + fill_config(output_path, base_path, silent=True) + filled_cfg = load_config(output_path) + assert filled_cfg["components"]["tok2vec"]["source"] == str(source_path) + assert filled_cfg["components"]["tagger"]["source"] == str(source_path) + assert filled_cfg["components"]["ner"]["factory"] == "ner" + assert "model" in filled_cfg["components"]["ner"] def test_cli_info(): diff --git a/spacy/tests/test_displacy.py b/spacy/tests/test_displacy.py index 790925888..392c95e42 100644 --- a/spacy/tests/test_displacy.py +++ b/spacy/tests/test_displacy.py @@ -1,11 +1,101 @@ +import numpy import pytest from spacy import displacy from spacy.displacy.render import DependencyRenderer, EntityRenderer +from spacy.lang.en import English from spacy.lang.fa import Persian from spacy.tokens import Span, Doc +@pytest.mark.issue(2361) +def test_issue2361(de_vocab): + """Test if < is escaped when rendering""" + chars = ("<", ">", "&", """) + words = ["<", ">", "&", '"'] + doc = Doc(de_vocab, words=words, deps=["dep"] * len(words)) + html = displacy.render(doc) + for char in chars: + assert char in html + + +@pytest.mark.issue(2728) +def test_issue2728(en_vocab): + """Test that displaCy ENT visualizer escapes HTML correctly.""" + doc = Doc(en_vocab, words=["test", "", "test"]) + doc.ents = [Span(doc, 0, 1, label="TEST")] + html = displacy.render(doc, style="ent") + assert "<RELEASE>" in html + doc.ents = [Span(doc, 1, 2, label="TEST")] + html = displacy.render(doc, style="ent") + assert "<RELEASE>" in html + + +@pytest.mark.issue(3288) +def test_issue3288(en_vocab): + """Test that retokenization works correctly via displaCy when punctuation + is merged onto the preceeding token and tensor is resized.""" + words = ["Hello", "World", "!", "When", "is", "this", "breaking", "?"] + heads = [1, 1, 1, 4, 4, 6, 4, 4] + deps = ["intj", "ROOT", "punct", "advmod", "ROOT", "det", "nsubj", "punct"] + doc = Doc(en_vocab, words=words, heads=heads, deps=deps) + doc.tensor = numpy.zeros((len(words), 96), dtype="float32") + displacy.render(doc) + + +@pytest.mark.issue(3531) +def test_issue3531(): + """Test that displaCy renderer doesn't require "settings" key.""" + example_dep = { + "words": [ + {"text": "But", "tag": "CCONJ"}, + {"text": "Google", "tag": "PROPN"}, + {"text": "is", "tag": "VERB"}, + {"text": "starting", "tag": "VERB"}, + {"text": "from", "tag": "ADP"}, + {"text": "behind.", "tag": "ADV"}, + ], + "arcs": [ + {"start": 0, "end": 3, 
"label": "cc", "dir": "left"}, + {"start": 1, "end": 3, "label": "nsubj", "dir": "left"}, + {"start": 2, "end": 3, "label": "aux", "dir": "left"}, + {"start": 3, "end": 4, "label": "prep", "dir": "right"}, + {"start": 4, "end": 5, "label": "pcomp", "dir": "right"}, + ], + } + example_ent = { + "text": "But Google is starting from behind.", + "ents": [{"start": 4, "end": 10, "label": "ORG"}], + } + dep_html = displacy.render(example_dep, style="dep", manual=True) + assert dep_html + ent_html = displacy.render(example_ent, style="ent", manual=True) + assert ent_html + + +@pytest.mark.issue(3882) +def test_issue3882(en_vocab): + """Test that displaCy doesn't serialize the doc.user_data when making a + copy of the Doc. + """ + doc = Doc(en_vocab, words=["Hello", "world"], deps=["dep", "dep"]) + doc.user_data["test"] = set() + displacy.parse_deps(doc) + + +@pytest.mark.issue(5838) +def test_issue5838(): + # Displacy's EntityRenderer break line + # not working after last entity + sample_text = "First line\nSecond line, with ent\nThird line\nFourth line\n" + nlp = English() + doc = nlp(sample_text) + doc.ents = [Span(doc, 7, 8, label="test")] + html = displacy.render(doc, style="ent") + found = html.count("
") + assert found == 4 + + def test_displacy_parse_ents(en_vocab): """Test that named entities on a Doc are converted into displaCy's format.""" doc = Doc(en_vocab, words=["But", "Google", "is", "starting", "from", "behind"]) diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py index f17d5e62e..d8743d322 100644 --- a/spacy/tests/test_misc.py +++ b/spacy/tests/test_misc.py @@ -15,7 +15,8 @@ from spacy.training.batchers import minibatch_by_words from spacy.lang.en import English from spacy.lang.nl import Dutch from spacy.language import DEFAULT_CONFIG_PATH -from spacy.schemas import ConfigSchemaTraining +from spacy.schemas import ConfigSchemaTraining, TokenPattern, TokenPatternSchema +from pydantic import ValidationError from thinc.api import get_current_ops, NumpyOps, CupyOps @@ -33,6 +34,32 @@ def is_admin(): return admin +@pytest.mark.issue(6207) +def test_issue6207(en_tokenizer): + doc = en_tokenizer("zero one two three four five six") + + # Make spans + s1 = doc[:4] + s2 = doc[3:6] # overlaps with s1 + s3 = doc[5:7] # overlaps with s2, not s1 + + result = util.filter_spans((s1, s2, s3)) + assert s1 in result + assert s2 not in result + assert s3 in result + + +@pytest.mark.issue(6258) +def test_issue6258(): + """Test that the non-empty constraint pattern field is respected""" + # These one is valid + TokenPatternSchema(pattern=[TokenPattern()]) + # But an empty pattern list should fail to validate + # based on the schema's constraint + with pytest.raises(ValidationError): + TokenPatternSchema(pattern=[]) + + @pytest.mark.parametrize("text", ["hello/world", "hello world"]) def test_util_ensure_path_succeeds(text): path = util.ensure_path(text) diff --git a/spacy/tests/tokenizer/test_tokenizer.py b/spacy/tests/tokenizer/test_tokenizer.py index 452bcc079..c2aeffcb5 100644 --- a/spacy/tests/tokenizer/test_tokenizer.py +++ b/spacy/tests/tokenizer/test_tokenizer.py @@ -1,9 +1,283 @@ -import pytest import re -from spacy.vocab import Vocab -from spacy.tokenizer import Tokenizer -from spacy.util import ensure_path, compile_prefix_regex, compile_suffix_regex + +import numpy +import pytest + from spacy.lang.en import English +from spacy.lang.de import German +from spacy.tokenizer import Tokenizer +from spacy.tokens import Doc +from spacy.training import Example +from spacy.util import compile_prefix_regex, compile_suffix_regex, ensure_path +from spacy.vocab import Vocab +from spacy.symbols import ORTH + + +@pytest.mark.issue(743) +def test_issue743(): + doc = Doc(Vocab(), ["hello", "world"]) + token = doc[0] + s = set([token]) + items = list(s) + assert items[0] is token + + +@pytest.mark.issue(801) +@pytest.mark.skip( + reason="Can not be fixed unless with variable-width lookbehinds, cf. PR #3218" +) +@pytest.mark.parametrize( + "text,tokens", + [ + ('"deserve,"--and', ['"', "deserve", ',"--', "and"]), + ("exception;--exclusive", ["exception", ";--", "exclusive"]), + ("day.--Is", ["day", ".--", "Is"]), + ("refinement:--just", ["refinement", ":--", "just"]), + ("memories?--To", ["memories", "?--", "To"]), + ("Useful.=--Therefore", ["Useful", ".=--", "Therefore"]), + ("=Hope.=--Pandora", ["=", "Hope", ".=--", "Pandora"]), + ], +) +def test_issue801(en_tokenizer, text, tokens): + """Test that special characters + hyphens are split correctly.""" + doc = en_tokenizer(text) + assert len(doc) == len(tokens) + assert [t.text for t in doc] == tokens + + +@pytest.mark.issue(1061) +def test_issue1061(): + """Test special-case works after tokenizing. 
Was caching problem.""" + text = "I like _MATH_ even _MATH_ when _MATH_, except when _MATH_ is _MATH_! but not _MATH_." + tokenizer = English().tokenizer + doc = tokenizer(text) + assert "MATH" in [w.text for w in doc] + assert "_MATH_" not in [w.text for w in doc] + + tokenizer.add_special_case("_MATH_", [{ORTH: "_MATH_"}]) + doc = tokenizer(text) + assert "_MATH_" in [w.text for w in doc] + assert "MATH" not in [w.text for w in doc] + + # For sanity, check it works when pipeline is clean. + tokenizer = English().tokenizer + tokenizer.add_special_case("_MATH_", [{ORTH: "_MATH_"}]) + doc = tokenizer(text) + assert "_MATH_" in [w.text for w in doc] + assert "MATH" not in [w.text for w in doc] + + +@pytest.mark.issue(1963) +def test_issue1963(en_tokenizer): + """Test that doc.merge() resizes doc.tensor""" + doc = en_tokenizer("a b c d") + doc.tensor = numpy.ones((len(doc), 128), dtype="f") + with doc.retokenize() as retokenizer: + retokenizer.merge(doc[0:2]) + assert len(doc) == 3 + assert doc.tensor.shape == (3, 128) + + +@pytest.mark.skip( + reason="Can not be fixed without variable-width look-behind (which we don't want)" +) +@pytest.mark.issue(1235) +def test_issue1235(): + """Test that g is not split of if preceded by a number and a letter""" + nlp = English() + testwords = "e2g 2g 52g" + doc = nlp(testwords) + assert len(doc) == 5 + assert doc[0].text == "e2g" + assert doc[1].text == "2" + assert doc[2].text == "g" + assert doc[3].text == "52" + assert doc[4].text == "g" + + +@pytest.mark.issue(1242) +def test_issue1242(): + nlp = English() + doc = nlp("") + assert len(doc) == 0 + docs = list(nlp.pipe(["", "hello"])) + assert len(docs[0]) == 0 + assert len(docs[1]) == 1 + + +@pytest.mark.issue(1257) +def test_issue1257(): + """Test that tokens compare correctly.""" + doc1 = Doc(Vocab(), words=["a", "b", "c"]) + doc2 = Doc(Vocab(), words=["a", "c", "e"]) + assert doc1[0] != doc2[0] + assert not doc1[0] == doc2[0] + + +@pytest.mark.issue(1375) +def test_issue1375(): + """Test that token.nbor() raises IndexError for out-of-bounds access.""" + doc = Doc(Vocab(), words=["0", "1", "2"]) + with pytest.raises(IndexError): + assert doc[0].nbor(-1) + assert doc[1].nbor(-1).text == "0" + with pytest.raises(IndexError): + assert doc[2].nbor(1) + assert doc[1].nbor(1).text == "2" + + +@pytest.mark.issue(1488) +def test_issue1488(): + """Test that tokenizer can parse DOT inside non-whitespace separators""" + prefix_re = re.compile(r"""[\[\("']""") + suffix_re = re.compile(r"""[\]\)"']""") + infix_re = re.compile(r"""[-~\.]""") + simple_url_re = re.compile(r"""^https?://""") + + def my_tokenizer(nlp): + return Tokenizer( + nlp.vocab, + {}, + prefix_search=prefix_re.search, + suffix_search=suffix_re.search, + infix_finditer=infix_re.finditer, + token_match=simple_url_re.match, + ) + + nlp = English() + nlp.tokenizer = my_tokenizer(nlp) + doc = nlp("This is a test.") + for token in doc: + assert token.text + + +@pytest.mark.issue(1494) +def test_issue1494(): + """Test if infix_finditer works correctly""" + infix_re = re.compile(r"""[^a-z]""") + test_cases = [ + ("token 123test", ["token", "1", "2", "3", "test"]), + ("token 1test", ["token", "1test"]), + ("hello...test", ["hello", ".", ".", ".", "test"]), + ] + + def new_tokenizer(nlp): + return Tokenizer(nlp.vocab, {}, infix_finditer=infix_re.finditer) + + nlp = English() + nlp.tokenizer = new_tokenizer(nlp) + for text, expected in test_cases: + assert [token.text for token in nlp(text)] == expected + + +@pytest.mark.skip( + reason="Can not be fixed 
without iterative looping between prefix/suffix and infix" +) +@pytest.mark.issue(2070) +def test_issue2070(): + """Test that checks that a dot followed by a quote is handled + appropriately. + """ + # Problem: The dot is now properly split off, but the prefix/suffix rules + # are not applied again afterwards. This means that the quote will still be + # attached to the remaining token. + nlp = English() + doc = nlp('First sentence."A quoted sentence" he said ...') + assert len(doc) == 11 + + +@pytest.mark.issue(2926) +def test_issue2926(fr_tokenizer): + """Test that the tokenizer correctly splits tokens separated by a slash (/) + ending in a digit. + """ + doc = fr_tokenizer("Learn html5/css3/javascript/jquery") + assert len(doc) == 8 + assert doc[0].text == "Learn" + assert doc[1].text == "html5" + assert doc[2].text == "/" + assert doc[3].text == "css3" + assert doc[4].text == "/" + assert doc[5].text == "javascript" + assert doc[6].text == "/" + assert doc[7].text == "jquery" + + +@pytest.mark.parametrize( + "text", + [ + "ABLEItemColumn IAcceptance Limits of ErrorIn-Service Limits of ErrorColumn IIColumn IIIColumn IVColumn VComputed VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeCubic FeetCubic FeetCubic FeetCubic FeetCubic Feet1Up to 10.0100.0050.0100.005220.0200.0100.0200.010350.0360.0180.0360.0184100.0500.0250.0500.0255Over 100.5% of computed volume0.25% of computed volume0.5% of computed volume0.25% of computed volume TABLE ItemColumn IAcceptance Limits of ErrorIn-Service Limits of ErrorColumn IIColumn IIIColumn IVColumn VComputed VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeCubic FeetCubic FeetCubic FeetCubic FeetCubic Feet1Up to 10.0100.0050.0100.005220.0200.0100.0200.010350.0360.0180.0360.0184100.0500.0250.0500.0255Over 100.5% of computed volume0.25% of computed volume0.5% of computed volume0.25% of computed volume ItemColumn IAcceptance Limits of ErrorIn-Service Limits of ErrorColumn IIColumn IIIColumn IVColumn VComputed VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeCubic FeetCubic FeetCubic FeetCubic FeetCubic Feet1Up to 10.0100.0050.0100.005220.0200.0100.0200.010350.0360.0180.0360.0184100.0500.0250.0500.0255Over 100.5% of computed volume0.25% of computed volume0.5% of computed volume0.25% of computed volume", + "oow.jspsearch.eventoracleopenworldsearch.technologyoraclesolarissearch.technologystoragesearch.technologylinuxsearch.technologyserverssearch.technologyvirtualizationsearch.technologyengineeredsystemspcodewwmkmppscem:", + ], +) +@pytest.mark.issue(2626) +def test_issue2626_2835(en_tokenizer, text): + """Check that sentence doesn't cause an infinite loop in the tokenizer.""" + doc = en_tokenizer(text) + assert doc + + +@pytest.mark.issue(2656) +def test_issue2656(en_tokenizer): + """Test that tokenizer correctly splits off punctuation after numbers with + decimal points. + """ + doc = en_tokenizer("I went for 40.3, and got home by 10.0.") + assert len(doc) == 11 + assert doc[0].text == "I" + assert doc[1].text == "went" + assert doc[2].text == "for" + assert doc[3].text == "40.3" + assert doc[4].text == "," + assert doc[5].text == "and" + assert doc[6].text == "got" + assert doc[7].text == "home" + assert doc[8].text == "by" + assert doc[9].text == "10.0" + assert doc[10].text == "." 
+ + +@pytest.mark.issue(2754) +def test_issue2754(en_tokenizer): + """Test that words like 'a' and 'a.m.' don't get exceptional norm values.""" + a = en_tokenizer("a") + assert a[0].norm_ == "a" + am = en_tokenizer("am") + assert am[0].norm_ == "am" + + +@pytest.mark.issue(3002) +def test_issue3002(): + """Test that the tokenizer doesn't hang on a long list of dots""" + nlp = German() + doc = nlp( + "880.794.982.218.444.893.023.439.794.626.120.190.780.624.990.275.671 ist eine lange Zahl" + ) + assert len(doc) == 5 + + +@pytest.mark.skip(reason="default suffix rules avoid one upper-case letter before dot") +@pytest.mark.issue(3449) +def test_issue3449(): + nlp = English() + nlp.add_pipe("sentencizer") + text1 = "He gave the ball to I. Do you want to go to the movies with I?" + text2 = "He gave the ball to I. Do you want to go to the movies with I?" + text3 = "He gave the ball to I.\nDo you want to go to the movies with I?" + t1 = nlp(text1) + t2 = nlp(text2) + t3 = nlp(text3) + assert t1[5].text == "I" + assert t2[5].text == "I" + assert t3[5].text == "I" + + +@pytest.mark.parametrize( + "text,words", [("A'B C", ["A", "'", "B", "C"]), ("A-B", ["A-B"])] +) +def test_gold_misaligned(en_tokenizer, text, words): + doc = en_tokenizer(text) + Example.from_dict(doc, {"words": words}) def test_tokenizer_handles_no_word(tokenizer): diff --git a/spacy/tests/training/test_training.py b/spacy/tests/training/test_training.py index 68f86190b..0d73300d8 100644 --- a/spacy/tests/training/test_training.py +++ b/spacy/tests/training/test_training.py @@ -1,15 +1,18 @@ +import random + import numpy -from spacy.training import offsets_to_biluo_tags, biluo_tags_to_offsets, Alignment -from spacy.training import biluo_tags_to_spans, iob_to_biluo -from spacy.training import Corpus, docs_to_json, Example -from spacy.training.align import get_alignments -from spacy.training.converters import json_to_docs -from spacy.lang.en import English -from spacy.tokens import Doc, DocBin -from spacy.util import get_words_and_spaces, minibatch -from thinc.api import compounding import pytest import srsly +from spacy.lang.en import English +from spacy.tokens import Doc, DocBin +from spacy.training import Alignment, Corpus, Example, biluo_tags_to_offsets +from spacy.training import biluo_tags_to_spans, docs_to_json, iob_to_biluo +from spacy.training import offsets_to_biluo_tags +from spacy.training.align import get_alignments +from spacy.training.converters import json_to_docs +from spacy.util import get_words_and_spaces, load_model_from_path, minibatch +from spacy.util import load_config_from_str +from thinc.api import compounding from ..util import make_tempdir @@ -68,6 +71,207 @@ def vocab(): return nlp.vocab +@pytest.mark.issue(999) +def test_issue999(): + """Test that adding entities and resuming training works passably OK. + There are two issues here: + 1) We have to re-add labels. This isn't very nice. + 2) There's no way to set the learning rate for the weight update, so we + end up out-of-scale, causing it to learn too fast. 
+ """ + TRAIN_DATA = [ + ["hey", []], + ["howdy", []], + ["hey there", []], + ["hello", []], + ["hi", []], + ["i'm looking for a place to eat", []], + ["i'm looking for a place in the north of town", [(31, 36, "LOCATION")]], + ["show me chinese restaurants", [(8, 15, "CUISINE")]], + ["show me chines restaurants", [(8, 14, "CUISINE")]], + ] + nlp = English() + ner = nlp.add_pipe("ner") + for _, offsets in TRAIN_DATA: + for start, end, label in offsets: + ner.add_label(label) + nlp.initialize() + for itn in range(20): + random.shuffle(TRAIN_DATA) + for raw_text, entity_offsets in TRAIN_DATA: + example = Example.from_dict( + nlp.make_doc(raw_text), {"entities": entity_offsets} + ) + nlp.update([example]) + + with make_tempdir() as model_dir: + nlp.to_disk(model_dir) + nlp2 = load_model_from_path(model_dir) + + for raw_text, entity_offsets in TRAIN_DATA: + doc = nlp2(raw_text) + ents = {(ent.start_char, ent.end_char): ent.label_ for ent in doc.ents} + for start, end, label in entity_offsets: + if (start, end) in ents: + assert ents[(start, end)] == label + break + else: + if entity_offsets: + raise Exception(ents) + + +@pytest.mark.issue(4402) +def test_issue4402(): + json_data = { + "id": 0, + "paragraphs": [ + { + "raw": "How should I cook bacon in an oven?\nI've heard of people cooking bacon in an oven.", + "sentences": [ + { + "tokens": [ + {"id": 0, "orth": "How", "ner": "O"}, + {"id": 1, "orth": "should", "ner": "O"}, + {"id": 2, "orth": "I", "ner": "O"}, + {"id": 3, "orth": "cook", "ner": "O"}, + {"id": 4, "orth": "bacon", "ner": "O"}, + {"id": 5, "orth": "in", "ner": "O"}, + {"id": 6, "orth": "an", "ner": "O"}, + {"id": 7, "orth": "oven", "ner": "O"}, + {"id": 8, "orth": "?", "ner": "O"}, + ], + "brackets": [], + }, + { + "tokens": [ + {"id": 9, "orth": "\n", "ner": "O"}, + {"id": 10, "orth": "I", "ner": "O"}, + {"id": 11, "orth": "'ve", "ner": "O"}, + {"id": 12, "orth": "heard", "ner": "O"}, + {"id": 13, "orth": "of", "ner": "O"}, + {"id": 14, "orth": "people", "ner": "O"}, + {"id": 15, "orth": "cooking", "ner": "O"}, + {"id": 16, "orth": "bacon", "ner": "O"}, + {"id": 17, "orth": "in", "ner": "O"}, + {"id": 18, "orth": "an", "ner": "O"}, + {"id": 19, "orth": "oven", "ner": "O"}, + {"id": 20, "orth": ".", "ner": "O"}, + ], + "brackets": [], + }, + ], + "cats": [ + {"label": "baking", "value": 1.0}, + {"label": "not_baking", "value": 0.0}, + ], + }, + { + "raw": "What is the difference between white and brown eggs?\n", + "sentences": [ + { + "tokens": [ + {"id": 0, "orth": "What", "ner": "O"}, + {"id": 1, "orth": "is", "ner": "O"}, + {"id": 2, "orth": "the", "ner": "O"}, + {"id": 3, "orth": "difference", "ner": "O"}, + {"id": 4, "orth": "between", "ner": "O"}, + {"id": 5, "orth": "white", "ner": "O"}, + {"id": 6, "orth": "and", "ner": "O"}, + {"id": 7, "orth": "brown", "ner": "O"}, + {"id": 8, "orth": "eggs", "ner": "O"}, + {"id": 9, "orth": "?", "ner": "O"}, + ], + "brackets": [], + }, + {"tokens": [{"id": 10, "orth": "\n", "ner": "O"}], "brackets": []}, + ], + "cats": [ + {"label": "baking", "value": 0.0}, + {"label": "not_baking", "value": 1.0}, + ], + }, + ], + } + nlp = English() + attrs = ["ORTH", "SENT_START", "ENT_IOB", "ENT_TYPE"] + with make_tempdir() as tmpdir: + output_file = tmpdir / "test4402.spacy" + docs = json_to_docs([json_data]) + data = DocBin(docs=docs, attrs=attrs).to_bytes() + with output_file.open("wb") as file_: + file_.write(data) + reader = Corpus(output_file) + train_data = list(reader(nlp)) + assert len(train_data) == 2 + + split_train_data = [] + for eg in 
train_data: + split_train_data.extend(eg.split_sents()) + assert len(split_train_data) == 4 + + +CONFIG_7029 = """ +[nlp] +lang = "en" +pipeline = ["tok2vec", "tagger"] + +[components] + +[components.tok2vec] +factory = "tok2vec" + +[components.tok2vec.model] +@architectures = "spacy.Tok2Vec.v1" + +[components.tok2vec.model.embed] +@architectures = "spacy.MultiHashEmbed.v1" +width = ${components.tok2vec.model.encode:width} +attrs = ["NORM","PREFIX","SUFFIX","SHAPE"] +rows = [5000,2500,2500,2500] +include_static_vectors = false + +[components.tok2vec.model.encode] +@architectures = "spacy.MaxoutWindowEncoder.v1" +width = 96 +depth = 4 +window_size = 1 +maxout_pieces = 3 + +[components.tagger] +factory = "tagger" + +[components.tagger.model] +@architectures = "spacy.Tagger.v1" +nO = null + +[components.tagger.model.tok2vec] +@architectures = "spacy.Tok2VecListener.v1" +width = ${components.tok2vec.model.encode:width} +upstream = "*" +""" + + +@pytest.mark.issue(7029) +def test_issue7029(): + """Test that an empty document doesn't mess up an entire batch.""" + TRAIN_DATA = [ + ("I like green eggs", {"tags": ["N", "V", "J", "N"]}), + ("Eat blue ham", {"tags": ["V", "J", "N"]}), + ] + nlp = English.from_config(load_config_from_str(CONFIG_7029)) + train_examples = [] + for t in TRAIN_DATA: + train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) + optimizer = nlp.initialize(get_examples=lambda: train_examples) + for i in range(50): + losses = {} + nlp.update(train_examples, sgd=optimizer, losses=losses) + texts = ["first", "second", "third", "fourth", "and", "then", "some", ""] + docs1 = list(nlp.pipe(texts, batch_size=1)) + docs2 = list(nlp.pipe(texts, batch_size=4)) + assert [doc[0].tag_ for doc in docs1[:-1]] == [doc[0].tag_ for doc in docs2[:-1]] + + def test_gold_biluo_U(en_vocab): words = ["I", "flew", "to", "London", "."] spaces = [True, True, True, False, True] diff --git a/spacy/tests/vocab_vectors/test_lexeme.py b/spacy/tests/vocab_vectors/test_lexeme.py index b6fee6628..d91f41db3 100644 --- a/spacy/tests/vocab_vectors/test_lexeme.py +++ b/spacy/tests/vocab_vectors/test_lexeme.py @@ -1,7 +1,25 @@ -import pytest import numpy +import pytest from spacy.attrs import IS_ALPHA, IS_DIGIT +from spacy.lookups import Lookups +from spacy.tokens import Doc from spacy.util import OOV_RANK +from spacy.vocab import Vocab + + +@pytest.mark.issue(361) +@pytest.mark.parametrize("text1,text2", [("cat", "dog")]) +def test_issue361(en_vocab, text1, text2): + """Test Issue #361: Equality of lexemes""" + assert en_vocab[text1] == en_vocab[text1] + assert en_vocab[text1] != en_vocab[text2] + + +@pytest.mark.issue(600) +def test_issue600(): + vocab = Vocab(tag_map={"NN": {"pos": "NOUN"}}) + doc = Doc(vocab, words=["hello"]) + doc[0].tag_ = "NN" @pytest.mark.parametrize("text1,prob1,text2,prob2", [("NOUN", -1, "opera", -2)]) diff --git a/spacy/tests/vocab_vectors/test_similarity.py b/spacy/tests/vocab_vectors/test_similarity.py index b5f7303b5..3b9308f4d 100644 --- a/spacy/tests/vocab_vectors/test_similarity.py +++ b/spacy/tests/vocab_vectors/test_similarity.py @@ -16,6 +16,16 @@ def vocab(en_vocab, vectors): return en_vocab +@pytest.mark.issue(2219) +def test_issue2219(en_vocab): + """Test if indexing issue still occurs during Token-Token similarity""" + vectors = [("a", [1, 2, 3]), ("letter", [4, 5, 6])] + add_vecs_to_vocab(en_vocab, vectors) + [(word1, vec1), (word2, vec2)] = vectors + doc = Doc(en_vocab, words=[word1, word2]) + assert doc[0].similarity(doc[1]) == doc[1].similarity(doc[0]) + + 
def test_vectors_similarity_LL(vocab, vectors): [(word1, vec1), (word2, vec2)] = vectors lex1 = vocab[word1] diff --git a/spacy/tests/vocab_vectors/test_vectors.py b/spacy/tests/vocab_vectors/test_vectors.py index f2e74c3c9..9dc40b499 100644 --- a/spacy/tests/vocab_vectors/test_vectors.py +++ b/spacy/tests/vocab_vectors/test_vectors.py @@ -1,14 +1,15 @@ -import pytest import numpy -from numpy.testing import assert_allclose, assert_equal, assert_almost_equal -from thinc.api import get_current_ops +import pytest +from numpy.testing import assert_allclose, assert_almost_equal, assert_equal +from thinc.api import NumpyOps, get_current_ops + from spacy.lang.en import English -from spacy.vocab import Vocab -from spacy.vectors import Vectors -from spacy.tokenizer import Tokenizer from spacy.strings import hash_string # type: ignore +from spacy.tokenizer import Tokenizer from spacy.tokens import Doc from spacy.training.initialize import convert_vectors +from spacy.vectors import Vectors +from spacy.vocab import Vocab from ..util import add_vecs_to_vocab, get_cosine, make_tempdir @@ -65,6 +66,79 @@ def tokenizer_v(vocab): return Tokenizer(vocab, {}, None, None, None) +@pytest.mark.issue(1518) +def test_issue1518(): + """Test vectors.resize() works.""" + vectors = Vectors(shape=(10, 10)) + vectors.add("hello", row=2) + vectors.resize((5, 9)) + + +@pytest.mark.issue(1539) +def test_issue1539(): + """Ensure vectors.resize() doesn't try to modify dictionary during iteration.""" + v = Vectors(shape=(10, 10), keys=[5, 3, 98, 100]) + v.resize((100, 100)) + + +@pytest.mark.issue(1807) +def test_issue1807(): + """Test vocab.set_vector also adds the word to the vocab.""" + vocab = Vocab(vectors_name="test_issue1807") + assert "hello" not in vocab + vocab.set_vector("hello", numpy.ones((50,), dtype="f")) + assert "hello" in vocab + + +@pytest.mark.issue(2871) +def test_issue2871(): + """Test that vectors recover the correct key for spaCy reserved words.""" + words = ["dog", "cat", "SUFFIX"] + vocab = Vocab(vectors_name="test_issue2871") + vocab.vectors.resize(shape=(3, 10)) + vector_data = numpy.zeros((3, 10), dtype="f") + for word in words: + _ = vocab[word] # noqa: F841 + vocab.set_vector(word, vector_data[0]) + vocab.vectors.name = "dummy_vectors" + assert vocab["dog"].rank == 0 + assert vocab["cat"].rank == 1 + assert vocab["SUFFIX"].rank == 2 + assert vocab.vectors.find(key="dog") == 0 + assert vocab.vectors.find(key="cat") == 1 + assert vocab.vectors.find(key="SUFFIX") == 2 + + +@pytest.mark.issue(3412) +def test_issue3412(): + data = numpy.asarray([[0, 0, 0], [1, 2, 3], [9, 8, 7]], dtype="f") + vectors = Vectors(data=data, keys=["A", "B", "C"]) + keys, best_rows, scores = vectors.most_similar( + numpy.asarray([[9, 8, 7], [0, 0, 0]], dtype="f") + ) + assert best_rows[0] == 2 + + +@pytest.mark.issue(4725) +def test_issue4725_2(): + if isinstance(get_current_ops, NumpyOps): + # ensures that this runs correctly and doesn't hang or crash because of the global vectors + # if it does crash, it's usually because of calling 'spawn' for multiprocessing (e.g. 
on Windows), + # or because of issues with pickling the NER (cf test_issue4725_1) + vocab = Vocab(vectors_name="test_vocab_add_vector") + data = numpy.ndarray((5, 3), dtype="f") + data[0] = 1.0 + data[1] = 2.0 + vocab.set_vector("cat", data[0]) + vocab.set_vector("dog", data[1]) + nlp = English(vocab=vocab) + nlp.add_pipe("ner") + nlp.initialize() + docs = ["Kurt is in London."] * 10 + for _ in nlp.pipe(docs, batch_size=2, n_process=2): + pass + + def test_init_vectors_with_resize_shape(strings, resize_data): v = Vectors(shape=(len(strings), 3)) v.resize(shape=resize_data.shape) diff --git a/spacy/tests/vocab_vectors/test_vocab_api.py b/spacy/tests/vocab_vectors/test_vocab_api.py index 56ef1d108..16cf80a08 100644 --- a/spacy/tests/vocab_vectors/test_vocab_api.py +++ b/spacy/tests/vocab_vectors/test_vocab_api.py @@ -1,6 +1,19 @@ import pytest -from spacy.attrs import LEMMA, ORTH, IS_ALPHA +from spacy.attrs import IS_ALPHA, LEMMA, ORTH from spacy.parts_of_speech import NOUN, VERB +from spacy.vocab import Vocab + + +@pytest.mark.issue(1868) +def test_issue1868(): + """Test Vocab.__contains__ works with int keys.""" + vocab = Vocab() + lex = vocab["hello"] + assert lex.orth in vocab + assert lex.orth_ in vocab + assert "some string" not in vocab + int_id = vocab.strings.add("some string") + assert int_id not in vocab @pytest.mark.parametrize( From 472740d613675be8d3055e6e0a7e59b76d76a551 Mon Sep 17 00:00:00 2001 From: Natalia Rodnova <4512370+nrodnova@users.noreply.github.com> Date: Mon, 6 Dec 2021 01:58:01 -0700 Subject: [PATCH 093/133] Added sents property to Span for Spans spanning over several sentences (#9699) * Added sents property to Span class that returns a generator of sentences the Span belongs to * Added description to Span.sents property * Update test_span to clarify the difference between span.sent and span.sents Co-authored-by: Sofie Van Landeghem * Update spacy/tests/doc/test_span.py Co-authored-by: Sofie Van Landeghem * Fix documentation typos in spacy/tokens/span.pyx Co-authored-by: Sofie Van Landeghem * Update Span.sents doc string in spacy/tokens/span.pyx Co-authored-by: Sofie Van Landeghem * Parametrized test_span_spans * Corrected Span.sents to check for span-level hook first. Also, made Span.sent respect doc-level sents hook if no span-level hook is provided * Corrected Span ocumentation copy/paste issue * Put back accidentally deleted lines * Fixed formatting in span.pyx * Moved check for SENT_START annotation after user hooks in Span.sents * add version where the property was introduced Co-authored-by: Sofie Van Landeghem --- spacy/tests/doc/test_span.py | 70 ++++++++++++++++++++++++++++++++++++ spacy/tokens/span.pyx | 45 +++++++++++++++++++++++ website/docs/api/span.md | 21 +++++++++++ 3 files changed, 136 insertions(+) diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py index d18293d3f..10aba5b94 100644 --- a/spacy/tests/doc/test_span.py +++ b/spacy/tests/doc/test_span.py @@ -200,6 +200,12 @@ def test_spans_span_sent(doc, doc_not_parsed): assert doc[:2].sent.root.text == "is" assert doc[:2].sent.text == "This is a sentence." 
assert doc[6:7].sent.root.left_edge.text == "This" + assert doc[0 : len(doc)].sent == list(doc.sents)[0] + assert list(doc[0 : len(doc)].sents) == list(doc.sents) + + with pytest.raises(ValueError): + doc_not_parsed[:2].sent + # test on manual sbd doc_not_parsed[0].is_sent_start = True doc_not_parsed[5].is_sent_start = True @@ -207,6 +213,35 @@ def test_spans_span_sent(doc, doc_not_parsed): assert doc_not_parsed[10:14].sent == doc_not_parsed[5:] +@pytest.mark.parametrize( + "start,end,expected_sentence", + [ + (0, 14, "This is"), # Entire doc + (1, 4, "This is"), # Overlapping with 2 sentences + (0, 2, "This is"), # Beginning of the Doc. Full sentence + (0, 1, "This is"), # Beginning of the Doc. Part of a sentence + (10, 14, "And a"), # End of the Doc. Overlapping with 2 senteces + (12, 14, "third."), # End of the Doc. Full sentence + (1, 1, "This is"), # Empty Span + ], +) +def test_spans_span_sent_user_hooks(doc, start, end, expected_sentence): + + # Doc-level sents hook + def user_hook(doc): + return [doc[ii : ii + 2] for ii in range(0, len(doc), 2)] + + doc.user_hooks["sents"] = user_hook + + # Make sure doc-level sents hook works + assert doc[start:end].sent.text == expected_sentence + + # Span-level sent hook + doc.user_span_hooks["sent"] = lambda x: x + # Now, span=level sent hook overrides the doc-level sents hook + assert doc[start:end].sent == doc[start:end] + + def test_spans_lca_matrix(en_tokenizer): """Test span's lca matrix generation""" tokens = en_tokenizer("the lazy dog slept") @@ -536,3 +571,38 @@ def test_span_with_vectors(doc): # single-token span with vector assert_array_equal(ops.to_numpy(doc[10:11].vector), [-1, -1, -1]) doc.vocab.vectors = prev_vectors + + +@pytest.mark.parametrize( + "start,end,expected_sentences,expected_sentences_with_hook", + [ + (0, 14, 3, 7), # Entire doc + (3, 6, 2, 2), # Overlapping with 2 sentences + (0, 4, 1, 2), # Beginning of the Doc. Full sentence + (0, 3, 1, 2), # Beginning of the Doc. Part of a sentence + (9, 14, 2, 3), # End of the Doc. Overlapping with 2 senteces + (10, 14, 1, 2), # End of the Doc. Full sentence + (11, 14, 1, 2), # End of the Doc. Partial sentence + (0, 0, 1, 1), # Empty Span + ], +) +def test_span_sents(doc, start, end, expected_sentences, expected_sentences_with_hook): + + assert len(list(doc[start:end].sents)) == expected_sentences + + def user_hook(doc): + return [doc[ii : ii + 2] for ii in range(0, len(doc), 2)] + + doc.user_hooks["sents"] = user_hook + + assert len(list(doc[start:end].sents)) == expected_sentences_with_hook + + doc.user_span_hooks["sents"] = lambda x: [x] + + assert list(doc[start:end].sents)[0] == doc[start:end] + assert len(list(doc[start:end].sents)) == 1 + + +def test_span_sents_not_parsed(doc_not_parsed): + with pytest.raises(ValueError): + list(Span(doc_not_parsed, 0, 3).sents) diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 96f843a33..cd02cab36 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -404,6 +404,10 @@ cdef class Span: """ if "sent" in self.doc.user_span_hooks: return self.doc.user_span_hooks["sent"](self) + elif "sents" in self.doc.user_hooks: + for sentence in self.doc.user_hooks["sents"](self.doc): + if sentence.start <= self.start < sentence.end: + return sentence # Use `sent_start` token attribute to find sentence boundaries cdef int n = 0 if self.doc.has_annotation("SENT_START"): @@ -422,6 +426,47 @@ cdef class Span: else: raise ValueError(Errors.E030) + @property + def sents(self): + """Obtain the sentences that contain this span. 
If the given span + crosses sentence boundaries, return all sentences it is a part of. + + RETURNS (Iterable[Span]): All sentences that the span is a part of. + + DOCS: https://spacy.io/api/span#sents + """ + cdef int start + cdef int i + + if "sents" in self.doc.user_span_hooks: + yield from self.doc.user_span_hooks["sents"](self) + elif "sents" in self.doc.user_hooks: + for sentence in self.doc.user_hooks["sents"](self.doc): + if sentence.end > self.start: + if sentence.start < self.end or sentence.start == self.start == self.end: + yield sentence + else: + break + else: + if not self.doc.has_annotation("SENT_START"): + raise ValueError(Errors.E030) + # Use `sent_start` token attribute to find sentence boundaries + # Find start of the 1st sentence of the Span + start = self.start + while self.doc.c[start].sent_start != 1 and start > 0: + start -= 1 + + # Now, find all the sentences in the span + for i in range(start + 1, self.doc.length): + if self.doc.c[i].sent_start == 1: + yield Span(self.doc, start, i) + start = i + if start >= self.end: + break + if start < self.end: + yield Span(self.doc, start, self.end) + + @property def ents(self): """The named entities in the span. Returns a tuple of named entity diff --git a/website/docs/api/span.md b/website/docs/api/span.md index 2938b4253..7ecebf93e 100644 --- a/website/docs/api/span.md +++ b/website/docs/api/span.md @@ -518,6 +518,27 @@ sent = doc[sent.start : max(sent.end, span.end)] | ----------- | ------------------------------------------------------- | | **RETURNS** | The sentence span that this span is a part of. ~~Span~~ | +## Span.sents {#sents tag="property" model="sentences" new="3.2.1"} + +Returns a generator over the sentences the span belongs to. This property is only available +when [sentence boundaries](/usage/linguistic-features#sbd) have been set on the +document by the `parser`, `senter`, `sentencizer` or some custom function. It +will raise an error otherwise. + +If the span happens to cross sentence boundaries, all sentences the span overlaps with will be returned. + +> #### Example +> +> ```python +> doc = nlp("Give it back! 
He pleaded.") +> span = doc[2:4] +> assert len(span.sents) == 2 +> ``` + +| Name | Description | +| ----------- | -------------------------------------------------------------------------- | +| **RETURNS** | A generator yielding sentences this `Span` is a part of ~~Iterable[Span]~~ | + ## Attributes {#attributes} | Name | Description | From b56b9e7f31956edd59b5dbd676c7aa1b6e1048b0 Mon Sep 17 00:00:00 2001 From: Duygu Altinok Date: Mon, 6 Dec 2021 15:32:49 +0100 Subject: [PATCH 094/133] Entity ruler remove pattern (#9685) * added ruler coe * added error for none existing pattern * changed error to warning * changed error to warning * added basic tests * fixed place * added test files * went back to error * went back to pattern error * minor change to docs * changed style * changed doc * changed error slightly * added remove to phrasem api * error key already existed * phrase matcher match code to api * blacked tests * moved comments before expr * corrected error no * Update website/docs/api/entityruler.md Co-authored-by: Sofie Van Landeghem * Update website/docs/api/entityruler.md Co-authored-by: Sofie Van Landeghem Co-authored-by: Sofie Van Landeghem --- spacy/errors.py | 2 + spacy/matcher/phrasematcher.pyi | 16 +- spacy/pipeline/entityruler.py | 40 +++++ spacy/tests/pipeline/test_entity_ruler.py | 182 ++++++++++++++++++++++ website/docs/api/entityruler.md | 18 +++ 5 files changed, 252 insertions(+), 6 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index c5e364013..ad7a0280f 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -889,6 +889,8 @@ class Errors(metaclass=ErrorsWithCodes): "Non-UD tags should use the `tag` property.") E1022 = ("Words must be of type str or int, but input is of type '{wtype}'") E1023 = ("Couldn't read EntityRuler from the {path}. This file doesn't exist.") + E1024 = ("A pattern with ID \"{ent_id}\" is not present in EntityRuler patterns.") + # Deprecated model shortcuts, only used in errors and warnings diff --git a/spacy/matcher/phrasematcher.pyi b/spacy/matcher/phrasematcher.pyi index d73633ec0..741bf7bb6 100644 --- a/spacy/matcher/phrasematcher.pyi +++ b/spacy/matcher/phrasematcher.pyi @@ -8,12 +8,9 @@ class PhraseMatcher: def __init__( self, vocab: Vocab, attr: Optional[Union[int, str]], validate: bool = ... ) -> None: ... - def __call__( - self, - doclike: Union[Doc, Span], - *, - as_spans: bool = ..., - ) -> Union[List[Tuple[int, int, int]], List[Span]]: ... + def __reduce__(self) -> Any: ... + def __len__(self) -> int: ... + def __contains__(self, key: str) -> bool: ... def add( self, key: str, @@ -23,3 +20,10 @@ class PhraseMatcher: Callable[[Matcher, Doc, int, List[Tuple[Any, ...]]], Any] ] = ..., ) -> None: ... + def remove(self, key: str) -> None: ... + def __call__( + self, + doclike: Union[Doc, Span], + *, + as_spans: bool = ..., + ) -> Union[List[Tuple[int, int, int]], List[Span]]: ... 
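For reference, a minimal sketch of how the `remove` method introduced by this patch might be used once patterns carry an `id`, assuming a blank English pipeline (the pattern strings and IDs below are illustrative, not taken from the diff):

```python
import spacy

nlp = spacy.blank("en")
ruler = nlp.add_pipe("entity_ruler")
# Illustrative patterns; the labels, strings and IDs here are arbitrary.
ruler.add_patterns(
    [
        {"label": "ORG", "pattern": "Apple", "id": "apple"},
        {"label": "GPE", "pattern": [{"LOWER": "san"}, {"LOWER": "francisco"}], "id": "sf"},
    ]
)
# Removes every pattern that was registered under the ID "apple".
ruler.remove("apple")
# Removing an unknown ID raises a ValueError (the new E1024 error).
```

The diff below implements this by filtering the matching label/ID pairs out of `token_patterns` and `phrase_patterns` and unregistering them from the token and phrase matchers.
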
diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index 78d7a0be2..614d71f41 100644 --- a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -348,6 +348,46 @@ class EntityRuler(Pipe): self.nlp.vocab, attr=self.phrase_matcher_attr, validate=self._validate ) + def remove(self, ent_id: str) -> None: + """Remove a pattern by its ent_id if a pattern with this ent_id was added before + + ent_id (str): id of the pattern to be removed + RETURNS: None + DOCS: https://spacy.io/api/entityruler#remove + """ + label_id_pairs = [ + (label, eid) for (label, eid) in self._ent_ids.values() if eid == ent_id + ] + if not label_id_pairs: + raise ValueError(Errors.E1024.format(ent_id=ent_id)) + created_labels = [ + self._create_label(label, eid) for (label, eid) in label_id_pairs + ] + # remove the patterns from self.phrase_patterns + self.phrase_patterns = defaultdict( + list, + { + label: val + for (label, val) in self.phrase_patterns.items() + if label not in created_labels + }, + ) + # remove the patterns from self.token_pattern + self.token_patterns = defaultdict( + list, + { + label: val + for (label, val) in self.token_patterns.items() + if label not in created_labels + }, + ) + # remove the patterns from self.token_pattern + for label in created_labels: + if label in self.phrase_matcher: + self.phrase_matcher.remove(label) + else: + self.matcher.remove(label) + def _require_patterns(self) -> None: """Raise a warning if this component has no patterns defined.""" if len(self) == 0: diff --git a/spacy/tests/pipeline/test_entity_ruler.py b/spacy/tests/pipeline/test_entity_ruler.py index 0cecafff3..f2031d0a9 100644 --- a/spacy/tests/pipeline/test_entity_ruler.py +++ b/spacy/tests/pipeline/test_entity_ruler.py @@ -373,3 +373,185 @@ def test_entity_ruler_serialize_dir(nlp, patterns): ruler.from_disk(d / "test_ruler") # read from an existing directory with pytest.raises(ValueError): ruler.from_disk(d / "non_existing_dir") # read from a bad directory + + +def test_entity_ruler_remove_basic(nlp): + ruler = EntityRuler(nlp) + patterns = [ + {"label": "PERSON", "pattern": "Duygu", "id": "duygu"}, + {"label": "ORG", "pattern": "ACME", "id": "acme"}, + {"label": "ORG", "pattern": "ACM"}, + ] + ruler.add_patterns(patterns) + doc = ruler(nlp.make_doc("Duygu went to school")) + assert len(ruler.patterns) == 3 + assert len(doc.ents) == 1 + assert doc.ents[0].label_ == "PERSON" + assert doc.ents[0].text == "Duygu" + assert "PERSON||duygu" in ruler.phrase_matcher + ruler.remove("duygu") + doc = ruler(nlp.make_doc("Duygu went to school")) + assert len(doc.ents) == 0 + assert "PERSON||duygu" not in ruler.phrase_matcher + assert len(ruler.patterns) == 2 + + +def test_entity_ruler_remove_same_id_multiple_patterns(nlp): + ruler = EntityRuler(nlp) + patterns = [ + {"label": "PERSON", "pattern": "Duygu", "id": "duygu"}, + {"label": "ORG", "pattern": "DuyguCorp", "id": "duygu"}, + {"label": "ORG", "pattern": "ACME", "id": "acme"}, + ] + ruler.add_patterns(patterns) + doc = ruler(nlp.make_doc("Duygu founded DuyguCorp and ACME.")) + assert len(ruler.patterns) == 3 + assert "PERSON||duygu" in ruler.phrase_matcher + assert "ORG||duygu" in ruler.phrase_matcher + assert len(doc.ents) == 3 + ruler.remove("duygu") + doc = ruler(nlp.make_doc("Duygu founded DuyguCorp and ACME.")) + assert len(ruler.patterns) == 1 + assert "PERSON||duygu" not in ruler.phrase_matcher + assert "ORG||duygu" not in ruler.phrase_matcher + assert len(doc.ents) == 1 + + +def 
test_entity_ruler_remove_nonexisting_pattern(nlp): + ruler = EntityRuler(nlp) + patterns = [ + {"label": "PERSON", "pattern": "Duygu", "id": "duygu"}, + {"label": "ORG", "pattern": "ACME", "id": "acme"}, + {"label": "ORG", "pattern": "ACM"}, + ] + ruler.add_patterns(patterns) + assert len(ruler.patterns) == 3 + with pytest.raises(ValueError): + ruler.remove("nepattern") + assert len(ruler.patterns) == 3 + + +def test_entity_ruler_remove_several_patterns(nlp): + ruler = EntityRuler(nlp) + patterns = [ + {"label": "PERSON", "pattern": "Duygu", "id": "duygu"}, + {"label": "ORG", "pattern": "ACME", "id": "acme"}, + {"label": "ORG", "pattern": "ACM"}, + ] + ruler.add_patterns(patterns) + doc = ruler(nlp.make_doc("Duygu founded her company ACME.")) + assert len(ruler.patterns) == 3 + assert len(doc.ents) == 2 + assert doc.ents[0].label_ == "PERSON" + assert doc.ents[0].text == "Duygu" + assert doc.ents[1].label_ == "ORG" + assert doc.ents[1].text == "ACME" + ruler.remove("duygu") + doc = ruler(nlp.make_doc("Duygu founded her company ACME")) + assert len(ruler.patterns) == 2 + assert len(doc.ents) == 1 + assert doc.ents[0].label_ == "ORG" + assert doc.ents[0].text == "ACME" + ruler.remove("acme") + doc = ruler(nlp.make_doc("Duygu founded her company ACME")) + assert len(ruler.patterns) == 1 + assert len(doc.ents) == 0 + + +def test_entity_ruler_remove_patterns_in_a_row(nlp): + ruler = EntityRuler(nlp) + patterns = [ + {"label": "PERSON", "pattern": "Duygu", "id": "duygu"}, + {"label": "ORG", "pattern": "ACME", "id": "acme"}, + {"label": "DATE", "pattern": "her birthday", "id": "bday"}, + {"label": "ORG", "pattern": "ACM"}, + ] + ruler.add_patterns(patterns) + doc = ruler(nlp.make_doc("Duygu founded her company ACME on her birthday")) + assert len(doc.ents) == 3 + assert doc.ents[0].label_ == "PERSON" + assert doc.ents[0].text == "Duygu" + assert doc.ents[1].label_ == "ORG" + assert doc.ents[1].text == "ACME" + assert doc.ents[2].label_ == "DATE" + assert doc.ents[2].text == "her birthday" + ruler.remove("duygu") + ruler.remove("acme") + ruler.remove("bday") + doc = ruler(nlp.make_doc("Duygu went to school")) + assert len(doc.ents) == 0 + + +def test_entity_ruler_remove_all_patterns(nlp): + ruler = EntityRuler(nlp) + patterns = [ + {"label": "PERSON", "pattern": "Duygu", "id": "duygu"}, + {"label": "ORG", "pattern": "ACME", "id": "acme"}, + {"label": "DATE", "pattern": "her birthday", "id": "bday"}, + ] + ruler.add_patterns(patterns) + assert len(ruler.patterns) == 3 + ruler.remove("duygu") + assert len(ruler.patterns) == 2 + ruler.remove("acme") + assert len(ruler.patterns) == 1 + ruler.remove("bday") + assert len(ruler.patterns) == 0 + with pytest.warns(UserWarning): + doc = ruler(nlp.make_doc("Duygu founded her company ACME on her birthday")) + assert len(doc.ents) == 0 + + +def test_entity_ruler_remove_and_add(nlp): + ruler = EntityRuler(nlp) + patterns = [{"label": "DATE", "pattern": "last time"}] + ruler.add_patterns(patterns) + doc = ruler( + nlp.make_doc("I saw him last time we met, this time he brought some flowers") + ) + assert len(ruler.patterns) == 1 + assert len(doc.ents) == 1 + assert doc.ents[0].label_ == "DATE" + assert doc.ents[0].text == "last time" + patterns1 = [{"label": "DATE", "pattern": "this time", "id": "ttime"}] + ruler.add_patterns(patterns1) + doc = ruler( + nlp.make_doc("I saw him last time we met, this time he brought some flowers") + ) + assert len(ruler.patterns) == 2 + assert len(doc.ents) == 2 + assert doc.ents[0].label_ == "DATE" + assert doc.ents[0].text == 
"last time" + assert doc.ents[1].label_ == "DATE" + assert doc.ents[1].text == "this time" + ruler.remove("ttime") + doc = ruler( + nlp.make_doc("I saw him last time we met, this time he brought some flowers") + ) + assert len(ruler.patterns) == 1 + assert len(doc.ents) == 1 + assert doc.ents[0].label_ == "DATE" + assert doc.ents[0].text == "last time" + ruler.add_patterns(patterns1) + doc = ruler( + nlp.make_doc("I saw him last time we met, this time he brought some flowers") + ) + assert len(ruler.patterns) == 2 + assert len(doc.ents) == 2 + patterns2 = [{"label": "DATE", "pattern": "another time", "id": "ttime"}] + ruler.add_patterns(patterns2) + doc = ruler( + nlp.make_doc( + "I saw him last time we met, this time he brought some flowers, another time some chocolate." + ) + ) + assert len(ruler.patterns) == 3 + assert len(doc.ents) == 3 + ruler.remove("ttime") + doc = ruler( + nlp.make_doc( + "I saw him last time we met, this time he brought some flowers, another time some chocolate." + ) + ) + assert len(ruler.patterns) == 1 + assert len(doc.ents) == 1 diff --git a/website/docs/api/entityruler.md b/website/docs/api/entityruler.md index fb33642f8..6d8f835bf 100644 --- a/website/docs/api/entityruler.md +++ b/website/docs/api/entityruler.md @@ -210,6 +210,24 @@ of dicts) or a phrase pattern (string). For more details, see the usage guide on | ---------- | ---------------------------------------------------------------- | | `patterns` | The patterns to add. ~~List[Dict[str, Union[str, List[dict]]]]~~ | + +## EntityRuler.remove {#remove tag="method" new="3.2.1"} + +Remove a pattern by its ID from the entity ruler. A `ValueError` is raised if the ID does not exist. + +> #### Example +> +> ```python +> patterns = [{"label": "ORG", "pattern": "Apple", "id": "apple"}] +> ruler = nlp.add_pipe("entity_ruler") +> ruler.add_patterns(patterns) +> ruler.remove("apple") +> ``` + +| Name | Description | +| ---------- | ---------------------------------------------------------------- | +| `id` | The ID of the pattern rule. ~~str~~ | + ## EntityRuler.to_disk {#to_disk tag="method"} Save the entity ruler patterns to a directory. The patterns will be saved as From 9964243eb2a0a2aa78e7f2377e57b97f8225bba3 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 6 Dec 2021 18:04:44 +0100 Subject: [PATCH 095/133] Make the Tagger neg_prefix configurable (#9802) --- spacy/pipeline/tagger.pyx | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx index c0768dfec..a2bec888e 100644 --- a/spacy/pipeline/tagger.pyx +++ b/spacy/pipeline/tagger.pyx @@ -45,7 +45,7 @@ DEFAULT_TAGGER_MODEL = Config().from_str(default_model_config)["model"] @Language.factory( "tagger", assigns=["token.tag"], - default_config={"model": DEFAULT_TAGGER_MODEL, "overwrite": False, "scorer": {"@scorers": "spacy.tagger_scorer.v1"}}, + default_config={"model": DEFAULT_TAGGER_MODEL, "overwrite": False, "scorer": {"@scorers": "spacy.tagger_scorer.v1"}, "neg_prefix": "!"}, default_score_weights={"tag_acc": 1.0}, ) def make_tagger( @@ -54,6 +54,7 @@ def make_tagger( model: Model, overwrite: bool, scorer: Optional[Callable], + neg_prefix: str, ): """Construct a part-of-speech tagger component. @@ -62,7 +63,7 @@ def make_tagger( in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to 1). 
""" - return Tagger(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer) + return Tagger(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer, neg_prefix=neg_prefix) def tagger_score(examples, **kwargs): @@ -87,6 +88,7 @@ class Tagger(TrainablePipe): *, overwrite=BACKWARD_OVERWRITE, scorer=tagger_score, + neg_prefix="!", ): """Initialize a part-of-speech tagger. @@ -103,7 +105,7 @@ class Tagger(TrainablePipe): self.model = model self.name = name self._rehearsal_model = None - cfg = {"labels": [], "overwrite": overwrite} + cfg = {"labels": [], "overwrite": overwrite, "neg_prefix": neg_prefix} self.cfg = dict(sorted(cfg.items())) self.scorer = scorer @@ -253,7 +255,7 @@ class Tagger(TrainablePipe): DOCS: https://spacy.io/api/tagger#get_loss """ validate_examples(examples, "Tagger.get_loss") - loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False, neg_prefix="!") + loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False, neg_prefix=self.cfg["neg_prefix"]) # Convert empty tag "" to missing value None so that both misaligned # tokens and tokens with missing annotation have the default missing # value None. From a0cdc2b007799317e3d8691b31de045dd8a893d1 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 6 Dec 2021 20:39:15 +0100 Subject: [PATCH 096/133] Use Language.pipe in evaluate (#9800) --- spacy/language.py | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index 204b24ecb..638616316 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1404,20 +1404,13 @@ class Language: for eg in examples: self.make_doc(eg.reference.text) # apply all pipeline components - for name, pipe in self.pipeline: - kwargs = component_cfg.get(name, {}) - kwargs.setdefault("batch_size", batch_size) - for doc, eg in zip( - _pipe( - (eg.predicted for eg in examples), - proc=pipe, - name=name, - default_error_handler=self.default_error_handler, - kwargs=kwargs, - ), - examples, - ): - eg.predicted = doc + docs = self.pipe( + (eg.predicted for eg in examples), + batch_size=batch_size, + component_cfg=component_cfg, + ) + for eg, doc in zip(examples, docs): + eg.predicted = doc end_time = timer() results = scorer.score(examples) n_words = sum(len(eg.predicted) for eg in examples) From 51a3b60027d3daa9da503b153845a823165a9f8d Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 7 Dec 2021 09:42:40 +0100 Subject: [PATCH 097/133] Document Tagger neg_prefix, fix typo (#9821) --- website/docs/api/entityrecognizer.md | 2 +- website/docs/api/tagger.md | 11 ++++++----- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/website/docs/api/entityrecognizer.md b/website/docs/api/entityrecognizer.md index 2f7a88fbf..14b6fece4 100644 --- a/website/docs/api/entityrecognizer.md +++ b/website/docs/api/entityrecognizer.md @@ -65,7 +65,7 @@ architectures and their arguments and hyperparameters. | `moves` | A list of transition names. Inferred from the data if not provided. Defaults to `None`. ~~Optional[List[str]]~~ | | `update_with_oracle_cut_size` | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. Defaults to `100`. ~~int~~ | | `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [TransitionBasedParser](/api/architectures#TransitionBasedParser). 
~~Model[List[Doc], List[Floats2d]]~~ | -| `incorrect_spans_key` | This key refers to a `SpanGroup` in `doc.spans` that specifies incorrect spans. The NER wiill learn not to predict (exactly) those spans. Defaults to `None`. ~~Optional[str]~~ | +| `incorrect_spans_key` | This key refers to a `SpanGroup` in `doc.spans` that specifies incorrect spans. The NER will learn not to predict (exactly) those spans. Defaults to `None`. ~~Optional[str]~~ | | `scorer` | The scoring method. Defaults to [`spacy.scorer.get_ner_prf`](/api/scorer#get_ner_prf). ~~Optional[Callable]~~ | ```python diff --git a/website/docs/api/tagger.md b/website/docs/api/tagger.md index 93b6bc88b..b51864d3a 100644 --- a/website/docs/api/tagger.md +++ b/website/docs/api/tagger.md @@ -40,11 +40,12 @@ architectures and their arguments and hyperparameters. > nlp.add_pipe("tagger", config=config) > ``` -| Setting | Description | -| ---------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `model` | A model instance that predicts the tag probabilities. The output vectors should match the number of tags in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ | -| `overwrite` 3.2 | Whether existing annotation is overwritten. Defaults to `False`. ~~bool~~ | -| `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attribute `"tag"`. ~~Optional[Callable]~~ | +| Setting | Description | +| ------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `model` | A model instance that predicts the tag probabilities. The output vectors should match the number of tags in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ | +| `overwrite` 3.2 | Whether existing annotation is overwritten. Defaults to `False`. ~~bool~~ | +| `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attribute `"tag"`. ~~Optional[Callable]~~ | +| `neg_prefix` 3.2.1 | The prefix used to specify incorrect tags while training. The tagger will learn not to predict exactly this tag. Defaults to `!`. 
~~str~~ | ```python %%GITHUB_SPACY/spacy/pipeline/tagger.pyx From 800737b41632c4b8dc24c992c8dd36f073549579 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 7 Dec 2021 10:51:45 +0100 Subject: [PATCH 098/133] Set version to v3.2.1 (#9823) --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index 29f78805c..c253d5052 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy" -__version__ = "3.2.0" +__version__ = "3.2.1" __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" __projects__ = "https://github.com/explosion/projects" From ba0fa7a64e7b225ef9cca188b8894f6fda7502a4 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 15 Dec 2021 09:27:08 +0100 Subject: [PATCH 099/133] Support Google Sheets embeds in docs (#9861) --- website/src/components/embed.js | 22 +++++++++++++++++++++- website/src/styles/embed.module.sass | 4 ++++ website/src/templates/index.js | 3 ++- 3 files changed, 27 insertions(+), 2 deletions(-) diff --git a/website/src/components/embed.js b/website/src/components/embed.js index 8d82bfaae..9f959bc99 100644 --- a/website/src/components/embed.js +++ b/website/src/components/embed.js @@ -3,6 +3,7 @@ import PropTypes from 'prop-types' import classNames from 'classnames' import Link from './link' +import Button from './button' import { InlineCode } from './code' import { markdownToReact } from './util' @@ -104,4 +105,23 @@ const Image = ({ src, alt, title, ...props }) => { ) } -export { YouTube, SoundCloud, Iframe, Image } +const GoogleSheet = ({ id, link, height, button = 'View full table' }) => { + return ( +
+