From a1f25412da871e14a60033c80616f139bd537b7d Mon Sep 17 00:00:00 2001 From: Richard Hudson Date: Mon, 22 Nov 2021 09:46:34 +0100 Subject: [PATCH 001/123] Edited Slovenian stop words list (#9707) --- spacy/lang/sl/stop_words.py | 130 +----------------------------------- 1 file changed, 1 insertion(+), 129 deletions(-) diff --git a/spacy/lang/sl/stop_words.py b/spacy/lang/sl/stop_words.py index 6fb01a183..c9004ed5d 100644 --- a/spacy/lang/sl/stop_words.py +++ b/spacy/lang/sl/stop_words.py @@ -1,13 +1,10 @@ # Source: https://github.com/stopwords-iso/stopwords-sl -# TODO: probably needs to be tidied up – the list seems to have month names in -# it, which shouldn't be considered stop words. +# Removed various words that are not normally considered stop words, such as months. STOP_WORDS = set( """ a ali -april -avgust b bi bil @@ -19,7 +16,6 @@ biti blizu bo bodo -bojo bolj bom bomo @@ -37,16 +33,6 @@ da daleč dan danes -datum -december -deset -deseta -deseti -deseto -devet -deveta -deveti -deveto do dober dobra @@ -54,16 +40,7 @@ dobri dobro dokler dol -dolg -dolga -dolgi dovolj -drug -druga -drugi -drugo -dva -dve e eden en @@ -74,7 +51,6 @@ enkrat eno etc. f -februar g g. ga @@ -93,16 +69,12 @@ iv ix iz j -januar jaz je ji jih jim jo -julij -junij -jutri k kadarkoli kaj @@ -123,41 +95,23 @@ kje kjer kjerkoli ko -koder koderkoli koga komu kot -kratek -kratka -kratke -kratki l -lahka -lahke -lahki -lahko le lep lepa lepe lepi lepo -leto m -maj -majhen -majhna -majhni -malce -malo manj -marec me med medtem mene -mesec mi midva midve @@ -183,7 +137,6 @@ najmanj naju največ nam -narobe nas nato nazaj @@ -192,7 +145,6 @@ naša naše ne nedavno -nedelja nek neka nekaj @@ -236,7 +188,6 @@ njuna njuno no nocoj -november npr. o ob @@ -244,51 +195,23 @@ oba obe oboje od -odprt -odprta -odprti okoli -oktober on onadva one oni onidve -osem -osma -osmi -osmo oz. p pa -pet -peta -petek -peti -peto po pod pogosto poleg -poln -polna -polni -polno ponavadi -ponedeljek ponovno potem povsod -pozdravljen -pozdravljeni -prav -prava -prave -pravi -pravo -prazen -prazna -prazno prbl. precej pred @@ -297,19 +220,10 @@ preko pri pribl. približno -primer -pripravljen -pripravljena -pripravljeni proti -prva -prvi -prvo r -ravno redko res -reč s saj sam @@ -321,29 +235,17 @@ se sebe sebi sedaj -sedem -sedma -sedmi -sedmo sem -september seveda si sicer skoraj skozi -slab smo so -sobota spet -sreda -srednja -srednji sta ste -stran -stvar sva t ta @@ -358,10 +260,6 @@ te tebe tebi tega -težak -težka -težki -težko ti tista tiste @@ -371,11 +269,6 @@ tj. 
tja to toda -torek -tretja -tretje -tretji -tri tu tudi tukaj @@ -392,10 +285,6 @@ vaša vaše ve vedno -velik -velika -veliki -veliko vendar ves več @@ -403,10 +292,6 @@ vi vidva vii viii -visok -visoka -visoke -visoki vsa vsaj vsak @@ -420,34 +305,21 @@ vsega vsi vso včasih -včeraj x z za zadaj zadnji zakaj -zaprta -zaprti -zaprto zdaj zelo zunaj č če često -četrta -četrtek -četrti -četrto čez čigav š -šest -šesta -šesti -šesto -štiri ž že """.split() From 25bd9f9d4876ed966142b01e3d37dd51c8a7c594 Mon Sep 17 00:00:00 2001 From: Duygu Altinok Date: Tue, 23 Nov 2021 16:29:25 +0100 Subject: [PATCH 002/123] Noun chunks for Italian (#9662) * added it vocab * copied portuguese * added possessive determiner * added conjed Nps * added nmoded Nps * test misc * more examples * fixed typo * fixed parenth * fixed comma * comma fix * added syntax iters * fix some index problems * fixed index * corrected heads for test case * fixed tets case * fixed determiner gender * cleaned left over * added example with apostophe --- spacy/lang/it/__init__.py | 4 +- spacy/lang/it/syntax_iterators.py | 86 +++++++++ spacy/tests/conftest.py | 5 + spacy/tests/lang/it/test_noun_chunks.py | 221 ++++++++++++++++++++++++ 4 files changed, 315 insertions(+), 1 deletion(-) create mode 100644 spacy/lang/it/syntax_iterators.py create mode 100644 spacy/tests/lang/it/test_noun_chunks.py diff --git a/spacy/lang/it/__init__.py b/spacy/lang/it/__init__.py index 1edebc837..ecf322bd7 100644 --- a/spacy/lang/it/__init__.py +++ b/spacy/lang/it/__init__.py @@ -6,13 +6,15 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES from ...language import Language, BaseDefaults from .lemmatizer import ItalianLemmatizer +from .syntax_iterators import SYNTAX_ITERATORS class ItalianDefaults(BaseDefaults): tokenizer_exceptions = TOKENIZER_EXCEPTIONS - stop_words = STOP_WORDS prefixes = TOKENIZER_PREFIXES infixes = TOKENIZER_INFIXES + stop_words = STOP_WORDS + syntax_iterators = SYNTAX_ITERATORS class Italian(Language): diff --git a/spacy/lang/it/syntax_iterators.py b/spacy/lang/it/syntax_iterators.py new file mode 100644 index 000000000..f63df3fad --- /dev/null +++ b/spacy/lang/it/syntax_iterators.py @@ -0,0 +1,86 @@ +from typing import Union, Iterator, Tuple + +from ...symbols import NOUN, PROPN, PRON +from ...errors import Errors +from ...tokens import Doc, Span + + +def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]: + """ + Detect base noun phrases from a dependency parse. Works on both Doc and Span. + """ + labels = [ + "nsubj", + "nsubj:pass", + "obj", + "obl", + "obl:agent", + "nmod", + "pcomp", + "appos", + "ROOT", + ] + post_modifiers = ["flat", "flat:name", "fixed", "compound"] + dets = ["det", "det:poss"] + doc = doclike.doc # Ensure works on both Doc and Span. 
+ if not doc.has_annotation("DEP"): + raise ValueError(Errors.E029) + np_deps = {doc.vocab.strings.add(label) for label in labels} + np_modifs = {doc.vocab.strings.add(modifier) for modifier in post_modifiers} + np_label = doc.vocab.strings.add("NP") + adj_label = doc.vocab.strings.add("amod") + det_labels = {doc.vocab.strings.add(det) for det in dets} + det_pos = doc.vocab.strings.add("DET") + adp_label = doc.vocab.strings.add("ADP") + conj = doc.vocab.strings.add("conj") + conj_pos = doc.vocab.strings.add("CCONJ") + prev_end = -1 + for i, word in enumerate(doclike): + if word.pos not in (NOUN, PROPN, PRON): + continue + # Prevent nested chunks from being produced + if word.left_edge.i <= prev_end: + continue + if word.dep in np_deps: + right_childs = list(word.rights) + right_child = right_childs[0] if right_childs else None + + if right_child: + if ( + right_child.dep == adj_label + ): # allow chain of adjectives by expanding to right + right_end = right_child.right_edge + elif ( + right_child.dep in det_labels and right_child.pos == det_pos + ): # cut relative pronouns here + right_end = right_child + elif right_child.dep in np_modifs: # Check if we can expand to right + right_end = word.right_edge + else: + right_end = word + else: + right_end = word + prev_end = right_end.i + + left_index = word.left_edge.i + left_index = ( + left_index + 1 if word.left_edge.pos == adp_label else left_index + ) + + yield left_index, right_end.i + 1, np_label + elif word.dep == conj: + head = word.head + while head.dep == conj and head.head.i < head.i: + head = head.head + # If the head is an NP, and we're coordinated to it, we're an NP + if head.dep in np_deps: + prev_end = word.i + + left_index = word.left_edge.i # eliminate left attached conjunction + left_index = ( + left_index + 1 if word.left_edge.pos == conj_pos else left_index + ) + yield left_index, word.i + 1, np_label + + +SYNTAX_ITERATORS = {"noun_chunks": noun_chunks} diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 88c7adfe3..2e75f9964 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -190,6 +190,11 @@ def it_tokenizer(): return get_lang_class("it")().tokenizer +@pytest.fixture(scope="session") +def it_vocab(): + return get_lang_class("it")().vocab + + @pytest.fixture(scope="session") def ja_tokenizer(): pytest.importorskip("sudachipy") diff --git a/spacy/tests/lang/it/test_noun_chunks.py b/spacy/tests/lang/it/test_noun_chunks.py new file mode 100644 index 000000000..0a8c10e79 --- /dev/null +++ b/spacy/tests/lang/it/test_noun_chunks.py @@ -0,0 +1,221 @@ +from spacy.tokens import Doc +import pytest + + +# fmt: off +@pytest.mark.parametrize( + "words,heads,deps,pos,chunk_offsets", + [ + # determiner + noun + # un pollo -> un pollo + ( + ["un", "pollo"], + [1, 1], + ["det", "ROOT"], + ["DET", "NOUN"], + [(0,2)], + ), + # two determiners + noun + # il mio cane -> il mio cane + ( + ["il", "mio", "cane"], + [2, 2, 2], + ["det", "det:poss", "ROOT"], + ["DET", "DET", "NOUN"], + [(0,3)], + ), + # two determiners, one is after noun. rare usage but still testing + # il cane mio-> il cane mio + ( + ["il", "cane", "mio"], + [1, 1, 1], + ["det", "ROOT", "det:poss"], + ["DET", "NOUN", "DET"], + [(0,3)], + ), + # relative pronoun + # È molto bello il vestito che hai acquistat -> il vestito, che the dress that you bought is very pretty. 
+ ( + ["È", "molto", "bello", "il", "vestito", "che", "hai", "acquistato"], + [2, 2, 2, 4, 2, 7, 7, 4], + ['cop', 'advmod', 'ROOT', 'det', 'nsubj', 'obj', 'aux', 'acl:relcl'], + ['AUX', 'ADV', 'ADJ', 'DET', 'NOUN', 'PRON', 'AUX', 'VERB'], + [(3,5), (5,6)] + ), + # relative subclause + # il computer che hai comprato -> il computer, che the computer that you bought + ( + ['il', 'computer', 'che', 'hai', 'comprato'], + [1, 1, 4, 4, 1], + ['det', 'ROOT', 'nsubj', 'aux', 'acl:relcl'], + ['DET', 'NOUN', 'PRON', 'AUX', 'VERB'], + [(0,2), (2,3)] + ), + # det + noun + adj + # Una macchina grande -> Una macchina grande + ( + ["Una", "macchina", "grande"], + [1, 1, 1], + ["det", "ROOT", "amod"], + ["DET", "NOUN", "ADJ"], + [(0,3)], + ), + # noun + adj plural + # mucche bianche + ( + ["mucche", "bianche"], + [0, 0], + ["ROOT", "amod"], + ["NOUN", "ADJ"], + [(0,2)], + ), + # det + adj + noun + # Una grande macchina -> Una grande macchina + ( + ['Una', 'grande', 'macchina'], + [2, 2, 2], + ["det", "amod", "ROOT"], + ["DET", "ADJ", "NOUN"], + [(0,3)] + ), + # det + adj + noun, det with apostrophe + # un'importante associazione -> un'importante associazione + ( + ["Un'", 'importante', 'associazione'], + [2, 2, 2], + ["det", "amod", "ROOT"], + ["DET", "ADJ", "NOUN"], + [(0,3)] + ), + # multiple adjectives + # Un cane piccolo e marrone -> Un cane piccolo e marrone + ( + ["Un", "cane", "piccolo", "e", "marrone"], + [1, 1, 1, 4, 2], + ["det", "ROOT", "amod", "cc", "conj"], + ["DET", "NOUN", "ADJ", "CCONJ", "ADJ"], + [(0,5)] + ), + # determiner, adjective, compound created by flat + # le Nazioni Unite -> le Nazioni Unite + ( + ["le", "Nazioni", "Unite"], + [1, 1, 1], + ["det", "ROOT", "flat:name"], + ["DET", "PROPN", "PROPN"], + [(0,3)] + ), + # one determiner + one noun + one adjective qualified by an adverb + # alcuni contadini molto ricchi -> alcuni contadini molto ricchi some very rich farmers + ( + ['alcuni', 'contadini', 'molto', 'ricchi'], + [1, 1, 3, 1], + ['det', 'ROOT', 'advmod', 'amod'], + ['DET', 'NOUN', 'ADV', 'ADJ'], + [(0,4)] + ), + # Two NPs conjuncted + # Ho un cane e un gatto -> un cane, un gatto + ( + ['Ho', 'un', 'cane', 'e', 'un', 'gatto'], + [0, 2, 0, 5, 5, 0], + ['ROOT', 'det', 'obj', 'cc', 'det', 'conj'], + ['VERB', 'DET', 'NOUN', 'CCONJ', 'DET', 'NOUN'], + [(1,3), (4,6)] + + ), + # Two NPs together + # lo scrittore brasiliano Aníbal Machado -> lo scrittore brasiliano, Aníbal Machado + ( + ['lo', 'scrittore', 'brasiliano', 'Aníbal', 'Machado'], + [1, 1, 1, 1, 3], + ['det', 'ROOT', 'amod', 'nmod', 'flat:name'], + ['DET', 'NOUN', 'ADJ', 'PROPN', 'PROPN'], + [(0, 3), (3, 5)] + ), + # Noun compound, person name and titles + # Dom Pedro II -> Dom Pedro II + ( + ["Dom", "Pedro", "II"], + [0, 0, 0], + ["ROOT", "flat:name", "flat:name"], + ["PROPN", "PROPN", "PROPN"], + [(0,3)] + ), + # Noun compound created by flat + # gli Stati Uniti + ( + ["gli", "Stati", "Uniti"], + [1, 1, 1], + ["det", "ROOT", "flat:name"], + ["DET", "PROPN", "PROPN"], + [(0,3)] + ), + # nmod relation between NPs + # la distruzione della città -> la distruzione, città + ( + ['la', 'distruzione', 'della', 'città'], + [1, 1, 3, 1], + ['det', 'ROOT', 'case', 'nmod'], + ['DET', 'NOUN', 'ADP', 'NOUN'], + [(0,2), (3,4)] + ), + # Compounding by nmod, several NPs chained together + # la prima fabbrica di droga del governo -> la prima fabbrica, droga, governo + ( + ["la", "prima", "fabbrica", "di", "droga", "del", "governo"], + [2, 2, 2, 4, 2, 6, 2], + ['det', 'amod', 'ROOT', 'case', 'nmod', 'case', 'nmod'], + ['DET', 'ADJ', 
'NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN'], + [(0, 3), (4, 5), (6, 7)] + ), + # several NPs + # Traduzione del rapporto di Susana -> Traduzione, rapporto, Susana + ( + ['Traduzione', 'del', 'rapporto', 'di', 'Susana'], + [0, 2, 0, 4, 2], + ['ROOT', 'case', 'nmod', 'case', 'nmod'], + ['NOUN', 'ADP', 'NOUN', 'ADP', 'PROPN'], + [(0,1), (2,3), (4,5)] + + ), + # Several NPs + # Il gatto grasso di Susana e la sua amica -> Il gatto grasso, Susana, sua amica + ( + ['Il', 'gatto', 'grasso', 'di', 'Susana', 'e', 'la', 'sua', 'amica'], + [1, 1, 1, 4, 1, 8, 8, 8, 1], + ['det', 'ROOT', 'amod', 'case', 'nmod', 'cc', 'det', 'det:poss', 'conj'], + ['DET', 'NOUN', 'ADJ', 'ADP', 'PROPN', 'CCONJ', 'DET', 'DET', 'NOUN'], + [(0,3), (4,5), (6,9)] + ), + # Passive subject + # La nuova spesa è alimentata dal grande conto in banca di Clinton -> Le nuova spesa, grande conto, banca, Clinton + ( + ['La', 'nuova', 'spesa', 'è', 'alimentata', 'dal', 'grande', 'conto', 'in', 'banca', 'di', 'Clinton'], + [2, 2, 4, 4, 4, 7, 7, 4, 9, 7, 11, 9], + ['det', 'amod', 'nsubj:pass', 'aux:pass', 'ROOT', 'case', 'amod', 'obl:agent', 'case', 'nmod', 'case', 'nmod'], + ['DET', 'ADJ', 'NOUN', 'AUX', 'VERB', 'ADP', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'ADP', 'PROPN'], + [(0, 3), (6, 8), (9, 10), (11,12)] + ), + # Misc + # Ma mentre questo prestito possa ora sembrare gestibile, un improvviso cambiamento delle circostanze potrebbe portare a problemi di debiti -> questo prestiti, un provisso cambiento, circostanze, problemi, debiti + ( + ['Ma', 'mentre', 'questo', 'prestito', 'possa', 'ora', 'sembrare', 'gestibile', ',', 'un', 'improvviso', 'cambiamento', 'delle', 'circostanze', 'potrebbe', 'portare', 'a', 'problemi', 'di', 'debitii'], + [15, 6, 3, 6, 6, 6, 15, 6, 6, 11, 11, 15, 13, 11, 15, 15, 17, 15, 19, 17], + ['cc', 'mark', 'det', 'nsubj', 'aux', 'advmod', 'advcl', 'xcomp', 'punct', 'det', 'amod', 'nsubj', 'case', 'nmod', 'aux', 'ROOT', 'case', 'obl', 'case', 'nmod'], + ['CCONJ', 'SCONJ', 'DET', 'NOUN', 'AUX', 'ADV', 'VERB', 'ADJ', 'PUNCT', 'DET', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'AUX', 'VERB', 'ADP', 'NOUN', 'ADP', 'NOUN'], + [(2,4), (9,12), (13,14), (17,18), (19,20)] + ) + ], +) +# fmt: on +def test_it_noun_chunks(it_vocab, words, heads, deps, pos, chunk_offsets): + doc = Doc(it_vocab, words=words, heads=heads, deps=deps, pos=pos) + assert [(c.start, c.end) for c in doc.noun_chunks] == chunk_offsets + + +def test_noun_chunks_is_parsed_it(it_tokenizer): + """Test that noun_chunks raises Value Error for 'it' language if Doc is not parsed.""" + doc = it_tokenizer("Sei andato a Oxford") + with pytest.raises(ValueError): + list(doc.noun_chunks) From 29f28d1f3e9d6a6ab19dc6edb7247c0d3f22df98 Mon Sep 17 00:00:00 2001 From: Duygu Altinok Date: Tue, 30 Nov 2021 12:19:07 +0100 Subject: [PATCH 003/123] French NP review (#9667) * adapted from pt * added basic tests * added fr vocab * fixed noun chunks * more examples * typo fix * changed naming * changed the naming * typo fix --- spacy/lang/fr/syntax_iterators.py | 72 ++++++-- spacy/tests/conftest.py | 5 + spacy/tests/lang/fr/test_noun_chunks.py | 224 +++++++++++++++++++++++- 3 files changed, 288 insertions(+), 13 deletions(-) diff --git a/spacy/lang/fr/syntax_iterators.py b/spacy/lang/fr/syntax_iterators.py index d86662693..5f7ba5c10 100644 --- a/spacy/lang/fr/syntax_iterators.py +++ b/spacy/lang/fr/syntax_iterators.py @@ -6,16 +6,35 @@ from ...tokens import Doc, Span def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]: - """Detect base noun phrases from a dependency parse. 
Works on Doc and Span.""" - # fmt: off - labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"] - # fmt: on + """ + Detect base noun phrases from a dependency parse. Works on both Doc and Span. + """ + labels = [ + "nsubj", + "nsubj:pass", + "obj", + "obl", + "obl:agent", + "obl:arg", + "obl:mod", + "nmod", + "pcomp", + "appos", + "ROOT", + ] + post_modifiers = ["flat", "flat:name", "flat:foreign", "fixed", "compound"] doc = doclike.doc # Ensure works on both Doc and Span. if not doc.has_annotation("DEP"): raise ValueError(Errors.E029) - np_deps = [doc.vocab.strings[label] for label in labels] - conj = doc.vocab.strings.add("conj") + np_deps = {doc.vocab.strings.add(label) for label in labels} + np_modifs = {doc.vocab.strings.add(modifier) for modifier in post_modifiers} np_label = doc.vocab.strings.add("NP") + adj_label = doc.vocab.strings.add("amod") + det_label = doc.vocab.strings.add("det") + det_pos = doc.vocab.strings.add("DET") + adp_pos = doc.vocab.strings.add("ADP") + conj_label = doc.vocab.strings.add("conj") + conj_pos = doc.vocab.strings.add("CCONJ") prev_end = -1 for i, word in enumerate(doclike): if word.pos not in (NOUN, PROPN, PRON): @@ -24,16 +43,45 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]: if word.left_edge.i <= prev_end: continue if word.dep in np_deps: - prev_end = word.right_edge.i - yield word.left_edge.i, word.right_edge.i + 1, np_label - elif word.dep == conj: + right_childs = list(word.rights) + right_child = right_childs[0] if right_childs else None + + if right_child: + if ( + right_child.dep == adj_label + ): # allow chain of adjectives by expanding to right + right_end = right_child.right_edge + elif ( + right_child.dep == det_label and right_child.pos == det_pos + ): # cut relative pronouns here + right_end = right_child + elif right_child.dep in np_modifs: # Check if we can expand to right + right_end = word.right_edge + else: + right_end = word + else: + right_end = word + prev_end = right_end.i + + left_index = word.left_edge.i + left_index = ( + left_index + 1 if word.left_edge.pos == adp_pos else left_index + ) + + yield left_index, right_end.i + 1, np_label + elif word.dep == conj_label: head = word.head - while head.dep == conj and head.head.i < head.i: + while head.dep == conj_label and head.head.i < head.i: head = head.head # If the head is an NP, and we're coordinated to it, we're an NP if head.dep in np_deps: - prev_end = word.right_edge.i - yield word.left_edge.i, word.right_edge.i + 1, np_label + prev_end = word.i + + left_index = word.left_edge.i # eliminate left attached conjunction + left_index = ( + left_index + 1 if word.left_edge.pos == conj_pos else left_index + ) + yield left_index, word.i + 1, np_label SYNTAX_ITERATORS = {"noun_chunks": noun_chunks} diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 2e75f9964..002a8f027 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -145,6 +145,11 @@ def fr_tokenizer(): return get_lang_class("fr")().tokenizer +@pytest.fixture(scope="session") +def fr_vocab(): + return get_lang_class("fr")().vocab + + @pytest.fixture(scope="session") def ga_tokenizer(): return get_lang_class("ga")().tokenizer diff --git a/spacy/tests/lang/fr/test_noun_chunks.py b/spacy/tests/lang/fr/test_noun_chunks.py index 48ac88ead..25b95f566 100644 --- a/spacy/tests/lang/fr/test_noun_chunks.py +++ b/spacy/tests/lang/fr/test_noun_chunks.py @@ -1,8 +1,230 @@ +from spacy.tokens import Doc import pytest +# fmt: off 
+@pytest.mark.parametrize( + "words,heads,deps,pos,chunk_offsets", + [ + # determiner + noun + # un nom -> un nom + ( + ["un", "nom"], + [1, 1], + ["det", "ROOT"], + ["DET", "NOUN"], + [(0, 2)], + ), + # determiner + noun starting with vowel + # l'heure -> l'heure + ( + ["l'", "heure"], + [1, 1], + ["det", "ROOT"], + ["DET", "NOUN"], + [(0, 2)], + ), + # determiner + plural noun + # les romans -> les romans + ( + ["les", "romans"], + [1, 1], + ["det", "ROOT"], + ["DET", "NOUN"], + [(0, 2)], + ), + # det + adj + noun + # Le vieux Londres -> Le vieux Londres + ( + ['Les', 'vieux', 'Londres'], + [2, 2, 2], + ["det", "amod", "ROOT"], + ["DET", "ADJ", "NOUN"], + [(0,3)] + ), + # det + noun + adj + # le nom propre -> le nom propre a proper noun + ( + ["le", "nom", "propre"], + [1, 1, 1], + ["det", "ROOT", "amod"], + ["DET", "NOUN", "ADJ"], + [(0, 3)], + ), + # det + noun + adj plural + # Les chiens bruns -> les chiens bruns + ( + ["Les", "chiens", "bruns"], + [1, 1, 1], + ["det", "ROOT", "amod"], + ["DET", "NOUN", "ADJ"], + [(0, 3)], + ), + # multiple adjectives: one adj before the noun, one adj after the noun + # un nouveau film intéressant -> un nouveau film intéressant + ( + ["un", "nouveau", "film", "intéressant"], + [2, 2, 2, 2], + ["det", "amod", "ROOT", "amod"], + ["DET", "ADJ", "NOUN", "ADJ"], + [(0,4)] + ), + # multiple adjectives, both adjs after the noun + # une personne intelligente et drôle -> une personne intelligente et drôle + ( + ["une", "personne", "intelligente", "et", "drôle"], + [1, 1, 1, 4, 2], + ["det", "ROOT", "amod", "cc", "conj"], + ["DET", "NOUN", "ADJ", "CCONJ", "ADJ"], + [(0,5)] + ), + # relative pronoun + # un bus qui va au ville -> un bus, qui, ville + ( + ['un', 'bus', 'qui', 'va', 'au', 'ville'], + [1, 1, 3, 1, 5, 3], + ['det', 'ROOT', 'nsubj', 'acl:relcl', 'case', 'obl:arg'], + ['DET', 'NOUN', 'PRON', 'VERB', 'ADP', 'NOUN'], + [(0,2), (2,3), (5,6)] + ), + # relative subclause + # Voilà la maison que nous voulons acheter -> la maison, nous That's the house that we want to buy. 
+ ( + ['Voilà', 'la', 'maison', 'que', 'nous', 'voulons', 'acheter'], + [0, 2, 0, 5, 5, 2, 5], + ['ROOT', 'det', 'obj', 'mark', 'nsubj', 'acl:relcl', 'xcomp'], + ['VERB', 'DET', 'NOUN', 'SCONJ', 'PRON', 'VERB', 'VERB'], + [(1,3), (4,5)] + ), + # Person name and title by flat + # Louis XIV -> Louis XIV + ( + ["Louis", "XIV"], + [0, 0], + ["ROOT", "flat:name"], + ["PROPN", "PROPN"], + [(0,2)] + ), + # Organization name by flat + # Nations Unies -> Nations Unies + ( + ["Nations", "Unies"], + [0, 0], + ["ROOT", "flat:name"], + ["PROPN", "PROPN"], + [(0,2)] + ), + # Noun compound, person name created by two flats + # Louise de Bratagne -> Louise de Bratagne + ( + ["Louise", "de", "Bratagne"], + [0, 0, 0], + ["ROOT", "flat:name", "flat:name"], + ["PROPN", "PROPN", "PROPN"], + [(0,3)] + ), + # Noun compound, person name created by two flats + # Louis François Joseph -> Louis François Joseph + ( + ["Louis", "François", "Joseph"], + [0, 0, 0], + ["ROOT", "flat:name", "flat:name"], + ["PROPN", "PROPN", "PROPN"], + [(0,3)] + ), + # one determiner + one noun + one adjective qualified by an adverb + # quelques agriculteurs très riches -> quelques agriculteurs très riches + ( + ["quelques", "agriculteurs", "très", "riches"], + [1, 1, 3, 1], + ['det', 'ROOT', 'advmod', 'amod'], + ['DET', 'NOUN', 'ADV', 'ADJ'], + [(0,4)] + ), + # Two NPs conjuncted + # Il a un chien et un chat -> Il, un chien, un chat + ( + ['Il', 'a', 'un', 'chien', 'et', 'un', 'chat'], + [1, 1, 3, 1, 6, 6, 3], + ['nsubj', 'ROOT', 'det', 'obj', 'cc', 'det', 'conj'], + ['PRON', 'VERB', 'DET', 'NOUN', 'CCONJ', 'DET', 'NOUN'], + [(0,1), (2,4), (5,7)] + + ), + # Two NPs together + # l'écrivain brésilien Aníbal Machado -> l'écrivain brésilien, Aníbal Machado + ( + ["l'", 'écrivain', 'brésilien', 'Aníbal', 'Machado'], + [1, 1, 1, 1, 3], + ['det', 'ROOT', 'amod', 'appos', 'flat:name'], + ['DET', 'NOUN', 'ADJ', 'PROPN', 'PROPN'], + [(0, 3), (3, 5)] + ), + # nmod relation between NPs + # la destruction de la ville -> la destruction, la ville + ( + ['la', 'destruction', 'de', 'la', 'ville'], + [1, 1, 4, 4, 1], + ['det', 'ROOT', 'case', 'det', 'nmod'], + ['DET', 'NOUN', 'ADP', 'DET', 'NOUN'], + [(0,2), (3,5)] + ), + # nmod relation between NPs + # Archiduchesse d’Autriche -> Archiduchesse, Autriche + ( + ['Archiduchesse', 'd’', 'Autriche'], + [0, 2, 0], + ['ROOT', 'case', 'nmod'], + ['NOUN', 'ADP', 'PROPN'], + [(0,1), (2,3)] + ), + # Compounding by nmod, several NPs chained together + # la première usine de drogue du gouvernement -> la première usine, drogue, gouvernement + ( + ["la", "première", "usine", "de", "drogue", "du", "gouvernement"], + [2, 2, 2, 4, 2, 6, 2], + ['det', 'amod', 'ROOT', 'case', 'nmod', 'case', 'nmod'], + ['DET', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN'], + [(0, 3), (4, 5), (6, 7)] + ), + # several NPs + # Traduction du rapport de Susana -> Traduction, rapport, Susana + ( + ['Traduction', 'du', 'raport', 'de', 'Susana'], + [0, 2, 0, 4, 2], + ['ROOT', 'case', 'nmod', 'case', 'nmod'], + ['NOUN', 'ADP', 'NOUN', 'ADP', 'PROPN'], + [(0,1), (2,3), (4,5)] + + ), + # Several NPs + # Le gros chat de Susana et son amie -> Le gros chat, Susana, son amie + ( + ['Le', 'gros', 'chat', 'de', 'Susana', 'et', 'son', 'amie'], + [2, 2, 2, 4, 2, 7, 7, 2], + ['det', 'amod', 'ROOT', 'case', 'nmod', 'cc', 'det', 'conj'], + ['DET', 'ADJ', 'NOUN', 'ADP', 'PROPN', 'CCONJ', 'DET', 'NOUN'], + [(0,3), (4,5), (6,8)] + ), + # Passive subject + # Les nouvelles dépenses sont alimentées par le grand compte bancaire de Clinton -> Les nouvelles dépenses, le 
grand compte bancaire, Clinton + ( + ['Les', 'nouvelles', 'dépenses', 'sont', 'alimentées', 'par', 'le', 'grand', 'compte', 'bancaire', 'de', 'Clinton'], + [2, 2, 4, 4, 4, 8, 8, 8, 4, 8, 11, 8], + ['det', 'amod', 'nsubj:pass', 'aux:pass', 'ROOT', 'case', 'det', 'amod', 'obl:agent', 'amod', 'case', 'nmod'], + ['DET', 'ADJ', 'NOUN', 'AUX', 'VERB', 'ADP', 'DET', 'ADJ', 'NOUN', 'ADJ', 'ADP', 'PROPN'], + [(0, 3), (6, 10), (11, 12)] + ) + ], +) +# fmt: on +def test_fr_noun_chunks(fr_vocab, words, heads, deps, pos, chunk_offsets): + doc = Doc(fr_vocab, words=words, heads=heads, deps=deps, pos=pos) + assert [(c.start, c.end) for c in doc.noun_chunks] == chunk_offsets + + def test_noun_chunks_is_parsed_fr(fr_tokenizer): """Test that noun_chunks raises Value Error for 'fr' language if Doc is not parsed.""" - doc = fr_tokenizer("trouver des travaux antérieurs") + doc = fr_tokenizer("Je suis allé à l'école") with pytest.raises(ValueError): list(doc.noun_chunks) From b4d526c357a606775e870c2dbe2a794140517d5d Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Tue, 30 Nov 2021 22:36:39 +0000 Subject: [PATCH 004/123] Add Japanese kana characters to default exceptions (fix #9693) (#9742) This includes the main kana, or phonetic characters, used in Japanese. There are some supplemental kana blocks in Unicode outside the BMP that could also be included, but because their actual use is rare I omitted them for now, but maybe they should be added. The omitted blocks are: - Kana Supplement - Kana Extended (A and B) - Small Kana Extension --- spacy/lang/char_classes.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/spacy/lang/char_classes.py b/spacy/lang/char_classes.py index 9e5441a4f..b15bb3cf3 100644 --- a/spacy/lang/char_classes.py +++ b/spacy/lang/char_classes.py @@ -45,6 +45,10 @@ _hangul_syllables = r"\uAC00-\uD7AF" _hangul_jamo = r"\u1100-\u11FF" _hangul = _hangul_syllables + _hangul_jamo +_hiragana = r"\u3040-\u309F" +_katakana = r"\u30A0-\u30FFー" +_kana = _hiragana + _katakana + # letters with diacritics - Catalan, Czech, Latin, Latvian, Lithuanian, Polish, Slovak, Turkish, Welsh _latin_u_extendedA = ( r"\u0100\u0102\u0104\u0106\u0108\u010A\u010C\u010E\u0110\u0112\u0114\u0116\u0118\u011A\u011C" @@ -244,6 +248,7 @@ _uncased = ( + _tamil + _telugu + _hangul + + _kana + _cjk ) From 251119455de70957088970ca0aa56624789ea65c Mon Sep 17 00:00:00 2001 From: Haakon Meland Eriksen Date: Tue, 7 Dec 2021 09:45:10 +0100 Subject: [PATCH 005/123] Remove NER words from stop words in Norwegian (#9820) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Default stop words in Norwegian bokmål (nb) in Spacy contain important entities, e.g. France, Germany, Russia, Sweden and USA, police district, important units of time, e.g. months and days of the week, and organisations. Nobody expects their presence among the default stop words. There is a danger of users complying with the general recommendation of filtering out stop words, while being unaware of filtering out important entities from their data. 
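To make the risk concrete, here is a minimal sketch (the example sentence is illustrative, and the exact behaviour depends on the stop word list shipped with the installed spaCy version). With the old defaults, lowercased matches such as "politiet", "frankrike" and "usa" are flagged by `Token.is_stop`, so the routine filtering step below silently drops exactly the entity tokens a user is likely to care about:

```python
import spacy

# Blank Norwegian bokmål pipeline: tokenization plus lexical attributes
# such as Token.is_stop, which is all this illustration needs.
nlp = spacy.blank("nb")

# Illustrative sentence: "The police in France and the USA cooperated."
doc = nlp("Politiet i Frankrike og USA samarbeidet.")

# The commonly recommended preprocessing step: drop stop words.
kept = [token.text for token in doc if not token.is_stop]
print(kept)
```

With this change, tokens like "Politiet", "Frankrike" and "USA" are no longer flagged as stop words, so the same filter keeps them.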
See explanation in https://github.com/explosion/spaCy/issues/3052#issuecomment-986756711 and comment https://github.com/explosion/spaCy/issues/3052#issuecomment-986951831 --- spacy/lang/nb/stop_words.py | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/spacy/lang/nb/stop_words.py b/spacy/lang/nb/stop_words.py index fd65dd788..d9ed414ef 100644 --- a/spacy/lang/nb/stop_words.py +++ b/spacy/lang/nb/stop_words.py @@ -4,46 +4,42 @@ alle allerede alt and andre annen annet at av bak bare bedre beste blant ble bli blir blitt bris by både -da dag de del dem den denne der dermed det dette disse drept du +da dag de del dem den denne der dermed det dette disse du eller en enn er et ett etter -fem fikk fire fjor flere folk for fortsatt fotball fra fram frankrike fredag +fem fikk fire fjor flere folk for fortsatt fra fram funnet få får fått før først første gang gi gikk gjennom gjorde gjort gjør gjøre god godt grunn gå går -ha hadde ham han hans har hele helt henne hennes her hun hva hvor hvordan -hvorfor +ha hadde ham han hans har hele helt henne hennes her hun i ifølge igjen ikke ingen inn ja jeg kamp kampen kan kl klart kom komme kommer kontakt kort kroner kunne kveld -kvinner -la laget land landet langt leder ligger like litt løpet lørdag +la laget land landet langt leder ligger like litt løpet -man mandag mange mannen mars med meg mellom men mener menn mennesker mens mer -millioner minutter mot msci mye må mål måtte +man mange med meg mellom men mener mennesker mens mer mot mye må mål måtte -ned neste noe noen nok norge norsk norske ntb ny nye nå når +ned neste noe noen nok ny nye nå når -og også om onsdag opp opplyser oslo oss over +og også om opp opplyser oss over -personer plass poeng politidistrikt politiet president prosent på +personer plass poeng på -regjeringen runde rundt russland +runde rundt -sa saken samme sammen samtidig satt se seg seks selv senere september ser sett +sa saken samme sammen samtidig satt se seg seks selv senere ser sett siden sier sin sine siste sitt skal skriver skulle slik som sted stedet stor -store står sverige svært så søndag +store står svært så -ta tatt tid tidligere til tilbake tillegg tirsdag to tok torsdag tre tror -tyskland +ta tatt tid tidligere til tilbake tillegg tok tror -under usa ut uten utenfor +under ut uten utenfor vant var ved veldig vi videre viktig vil ville viser vår være vært From 3cfeb518ee5a54742366ea5ad60ead420dcd8e3d Mon Sep 17 00:00:00 2001 From: Andrew Janco Date: Tue, 21 Dec 2021 09:46:33 -0500 Subject: [PATCH 006/123] Handle "_" value for token pos in conllu data (#9903) * change '_' to '' to allow Token.pos, when no value for token pos in conllu data * Minor code style Co-authored-by: Adriane Boyd --- spacy/training/converters/conllu_to_docs.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/training/converters/conllu_to_docs.py b/spacy/training/converters/conllu_to_docs.py index 66156b6e5..7a4f44d3b 100644 --- a/spacy/training/converters/conllu_to_docs.py +++ b/spacy/training/converters/conllu_to_docs.py @@ -188,6 +188,7 @@ def conllu_sentence_to_doc( id_ = int(id_) - 1 head = (int(head) - 1) if head not in ("0", "_") else id_ tag = pos if tag == "_" else tag + pos = pos if pos != "_" else "" morph = morph if morph != "_" else "" dep = "ROOT" if dep == "root" else dep lemmas.append(lemma) From 7ec1452f5fe2aea2aa74c4910a9a7903d979fb66 Mon Sep 17 00:00:00 2001 From: Duygu Altinok Date: Thu, 23 Dec 2021 13:41:01 +0100 Subject: [PATCH 007/123] added ellided forms (#9878) * added 
ellided forms * rearranged a bit * rearranged a bit * added stopword tests * blacked tests file --- spacy/lang/it/stop_words.py | 30 +++++++++++++-------------- spacy/tests/lang/it/test_stopwords.py | 17 +++++++++++++++ 2 files changed, 32 insertions(+), 15 deletions(-) create mode 100644 spacy/tests/lang/it/test_stopwords.py diff --git a/spacy/lang/it/stop_words.py b/spacy/lang/it/stop_words.py index 4178ed452..42adc7904 100644 --- a/spacy/lang/it/stop_words.py +++ b/spacy/lang/it/stop_words.py @@ -10,18 +10,18 @@ avresti avrete avrà avrò avuta avute avuti avuto basta bene benissimo brava bravo -casa caso cento certa certe certi certo che chi chicchessia chiunque ci +casa caso cento certa certe certi certo che chi chicchessia chiunque ci c' ciascuna ciascuno cima cio cioe circa citta città co codesta codesti codesto cogli coi col colei coll coloro colui come cominci comunque con concernente conciliarsi conclusione consiglio contro cortesia cos cosa cosi così cui -da dagl dagli dai dal dall dalla dalle dallo dappertutto davanti degl degli -dei del dell della delle dello dentro detto deve di dice dietro dire +d' da dagl dagli dai dal dall dall' dalla dalle dallo dappertutto davanti degl degli +dei del dell dell' della delle dello dentro detto deve di dice dietro dire dirimpetto diventa diventare diventato dopo dov dove dovra dovrà dovunque due dunque durante -ebbe ebbero ebbi ecc ecco ed effettivamente egli ella entrambi eppure era -erano eravamo eravate eri ero esempio esse essendo esser essere essi ex +e ebbe ebbero ebbi ecc ecco ed effettivamente egli ella entrambi eppure era +erano eravamo eravate eri ero esempio esse essendo esser essere essi ex è fa faccia facciamo facciano facciate faccio facemmo facendo facesse facessero facessi facessimo faceste facesti faceva facevamo facevano facevate facevi @@ -30,21 +30,21 @@ fareste faresti farete farà farò fatto favore fece fecero feci fin finalmente finche fine fino forse forza fosse fossero fossi fossimo foste fosti fra frattempo fu fui fummo fuori furono futuro generale -gia già giacche giorni giorno gli gliela gliele glieli glielo gliene governo +gia già giacche giorni giorno gli gl' gliela gliele glieli glielo gliene governo grande grazie gruppo ha haha hai hanno ho ieri il improvviso in inc infatti inoltre insieme intanto intorno invece io -la là lasciato lato lavoro le lei li lo lontano loro lui lungo luogo +l' la là lasciato lato lavoro le lei li lo lontano loro lui lungo luogo -ma macche magari maggior mai male malgrado malissimo mancanza marche me +m' ma macche magari maggior mai male malgrado malissimo mancanza marche me medesimo mediante meglio meno mentre mesi mezzo mi mia mie miei mila miliardi milioni minimi ministro mio modo molti moltissimo molto momento mondo mosto -nazionale ne negl negli nei nel nell nella nelle nello nemmeno neppure nessun -nessuna nessuno niente no noi non nondimeno nonostante nonsia nostra nostre +nazionale ne negl negli nei nel nell nella nelle nello nemmeno neppure nessun nessun' +nessuna nessuno nient' niente no noi non nondimeno nonostante nonsia nostra nostre nostri nostro novanta nove nulla nuovo od oggi ogni ognuna ognuno oltre oppure ora ore osi ossia ottanta otto @@ -56,12 +56,12 @@ potrebbe preferibilmente presa press prima primo principalmente probabilmente proprio puo può pure purtroppo qualche qualcosa qualcuna qualcuno quale quali qualunque quando quanta quante -quanti quanto quantunque quasi quattro quel quella quelle quelli quello quest +quanti quanto quantunque quasi quattro quel quel' 
quella quelle quelli quello quest quest' questa queste questi questo qui quindi realmente recente recentemente registrazione relativo riecco salvo -sara sarà sarai saranno sarebbe sarebbero sarei saremmo saremo sareste +s' sara sarà sarai saranno sarebbe sarebbero sarei saremmo saremo sareste saresti sarete saro sarò scola scopo scorso se secondo seguente seguito sei sembra sembrare sembrato sembri sempre senza sette si sia siamo siano siate siete sig solito solo soltanto sono sopra sotto spesso srl sta stai stando @@ -72,12 +72,12 @@ steste stesti stette stettero stetti stia stiamo stiano stiate sto su sua subito successivamente successivo sue sugl sugli sui sul sull sulla sulle sullo suo suoi -tale tali talvolta tanto te tempo ti titolo tra tranne tre trenta +t' tale tali talvolta tanto te tempo ti titolo tra tranne tre trenta troppo trovato tu tua tue tuo tuoi tutta tuttavia tutte tutti tutto -uguali ulteriore ultimo un una uno uomo +uguali ulteriore ultimo un un' una uno uomo -va vale vari varia varie vario verso vi via vicino visto vita voi volta volte +v' va vale vari varia varie vario verso vi via vicino visto vita voi volta volte vostra vostre vostri vostro """.split() ) diff --git a/spacy/tests/lang/it/test_stopwords.py b/spacy/tests/lang/it/test_stopwords.py new file mode 100644 index 000000000..954913164 --- /dev/null +++ b/spacy/tests/lang/it/test_stopwords.py @@ -0,0 +1,17 @@ +import pytest + + +@pytest.mark.parametrize( + "word", ["un", "lo", "dell", "dall", "si", "ti", "mi", "quest", "quel", "quello"] +) +def test_stopwords_basic(it_tokenizer, word): + tok = it_tokenizer(word)[0] + assert tok.is_stop + + +@pytest.mark.parametrize( + "word", ["quest'uomo", "l'ho", "un'amica", "dell'olio", "s'arrende", "m'ascolti"] +) +def test_stopwords_elided(it_tokenizer, word): + tok = it_tokenizer(word)[0] + assert tok.is_stop From 86e71e7b19a70da7139b33b88bc4ce89e9142f47 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Florian=20C=C3=A4sar?= Date: Wed, 29 Dec 2021 11:04:39 +0100 Subject: [PATCH 008/123] Fix Scorer.score_cats for missing labels (#9443) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Fix Scorer.score_cats for missing labels * Add test case for Scorer.score_cats missing labels * semantic nitpick * black formatting * adjust test to give different results depending on multi_label setting * fix loss function according to whether or not missing values are supported * add note to docs * small fixes * make mypy happy * Update spacy/pipeline/textcat.py Co-authored-by: Florian Cäsar Co-authored-by: Sofie Van Landeghem Co-authored-by: svlandeg --- spacy/pipeline/senter.pyx | 2 +- spacy/pipeline/spancat.py | 3 +- spacy/pipeline/textcat.py | 11 ++++- spacy/pipeline/textcat_multilabel.py | 10 +++-- spacy/scorer.py | 24 +++++----- spacy/tests/pipeline/test_textcat.py | 66 ++++++++++++++++++++++++++++ website/docs/api/textcategorizer.md | 6 ++- 7 files changed, 103 insertions(+), 19 deletions(-) diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx index 54ce021af..2e0f364f0 100644 --- a/spacy/pipeline/senter.pyx +++ b/spacy/pipeline/senter.pyx @@ -1,6 +1,6 @@ # cython: infer_types=True, profile=True, binding=True -from itertools import islice from typing import Optional, Callable +from itertools import islice import srsly from thinc.api import Model, SequenceCategoricalCrossentropy, Config diff --git a/spacy/pipeline/spancat.py b/spacy/pipeline/spancat.py index 829def1eb..01c9c407f 100644 --- a/spacy/pipeline/spancat.py +++ 
b/spacy/pipeline/spancat.py @@ -1,9 +1,10 @@ -import numpy from typing import List, Dict, Callable, Tuple, Optional, Iterable, Any, cast from thinc.api import Config, Model, get_current_ops, set_dropout_rate, Ops from thinc.api import Optimizer from thinc.types import Ragged, Ints2d, Floats2d, Ints1d +import numpy + from ..compat import Protocol, runtime_checkable from ..scorer import Scorer from ..language import Language diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py index 30a65ec52..e20ae87f1 100644 --- a/spacy/pipeline/textcat.py +++ b/spacy/pipeline/textcat.py @@ -1,8 +1,8 @@ -from itertools import islice from typing import Iterable, Tuple, Optional, Dict, List, Callable, Any from thinc.api import get_array_module, Model, Optimizer, set_dropout_rate, Config from thinc.types import Floats2d import numpy +from itertools import islice from .trainable_pipe import TrainablePipe from ..language import Language @@ -158,6 +158,13 @@ class TextCategorizer(TrainablePipe): self.cfg = dict(cfg) self.scorer = scorer + @property + def support_missing_values(self): + # There are no missing values as the textcat should always + # predict exactly one label. All other labels are 0.0 + # Subclasses may override this property to change internal behaviour. + return False + @property def labels(self) -> Tuple[str]: """RETURNS (Tuple[str]): The labels currently added to the component. @@ -294,7 +301,7 @@ class TextCategorizer(TrainablePipe): for j, label in enumerate(self.labels): if label in eg.reference.cats: truths[i, j] = eg.reference.cats[label] - else: + elif self.support_missing_values: not_missing[i, j] = 0.0 truths = self.model.ops.asarray(truths) # type: ignore return truths, not_missing # type: ignore diff --git a/spacy/pipeline/textcat_multilabel.py b/spacy/pipeline/textcat_multilabel.py index a7bfacca7..e33a885f8 100644 --- a/spacy/pipeline/textcat_multilabel.py +++ b/spacy/pipeline/textcat_multilabel.py @@ -1,8 +1,8 @@ -from itertools import islice from typing import Iterable, Optional, Dict, List, Callable, Any - -from thinc.api import Model, Config from thinc.types import Floats2d +from thinc.api import Model, Config + +from itertools import islice from ..language import Language from ..training import Example, validate_get_examples @@ -158,6 +158,10 @@ class MultiLabel_TextCategorizer(TextCategorizer): self.cfg = dict(cfg) self.scorer = scorer + @property + def support_missing_values(self): + return True + def initialize( # type: ignore[override] self, get_examples: Callable[[], Iterable[Example]], diff --git a/spacy/scorer.py b/spacy/scorer.py index 4d596b5e1..ae9338bd5 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -445,7 +445,8 @@ class Scorer: getter(doc, attr) should return the values for the individual doc. labels (Iterable[str]): The set of possible labels. Defaults to []. multi_label (bool): Whether the attribute allows multiple labels. - Defaults to True. + Defaults to True. When set to False (exclusive labels), missing + gold labels are interpreted as 0.0. positive_label (str): The positive label for a binary task with exclusive classes. Defaults to None. threshold (float): Cutoff to consider a prediction "positive". 
Defaults @@ -484,13 +485,15 @@ class Scorer: for label in labels: pred_score = pred_cats.get(label, 0.0) - gold_score = gold_cats.get(label, 0.0) + gold_score = gold_cats.get(label) + if not gold_score and not multi_label: + gold_score = 0.0 if gold_score is not None: auc_per_type[label].score_set(pred_score, gold_score) if multi_label: for label in labels: pred_score = pred_cats.get(label, 0.0) - gold_score = gold_cats.get(label, 0.0) + gold_score = gold_cats.get(label) if gold_score is not None: if pred_score >= threshold and gold_score > 0: f_per_type[label].tp += 1 @@ -502,16 +505,15 @@ class Scorer: # Get the highest-scoring for each. pred_label, pred_score = max(pred_cats.items(), key=lambda it: it[1]) gold_label, gold_score = max(gold_cats.items(), key=lambda it: it[1]) - if gold_score is not None: - if pred_label == gold_label and pred_score >= threshold: - f_per_type[pred_label].tp += 1 - else: - f_per_type[gold_label].fn += 1 - if pred_score >= threshold: - f_per_type[pred_label].fp += 1 + if pred_label == gold_label and pred_score >= threshold: + f_per_type[pred_label].tp += 1 + else: + f_per_type[gold_label].fn += 1 + if pred_score >= threshold: + f_per_type[pred_label].fp += 1 elif gold_cats: gold_label, gold_score = max(gold_cats, key=lambda it: it[1]) - if gold_score is not None and gold_score > 0: + if gold_score > 0: f_per_type[gold_label].fn += 1 elif pred_cats: pred_label, pred_score = max(pred_cats.items(), key=lambda it: it[1]) diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py index 282789f2b..52bf6ec5c 100644 --- a/spacy/tests/pipeline/test_textcat.py +++ b/spacy/tests/pipeline/test_textcat.py @@ -725,6 +725,72 @@ def test_textcat_evaluation(): assert scores["cats_micro_r"] == 4 / 6 +@pytest.mark.parametrize( + "multi_label,spring_p", + [(True, 1 / 1), (False, 1 / 2)], +) +def test_textcat_eval_missing(multi_label: bool, spring_p: float): + """ + multi-label: the missing 'spring' in gold_doc_2 doesn't incur a penalty + exclusive labels: the missing 'spring' in gold_doc_2 is interpreted as 0.0""" + train_examples = [] + nlp = English() + + ref1 = nlp("one") + ref1.cats = {"winter": 0.0, "summer": 0.0, "autumn": 0.0, "spring": 1.0} + pred1 = nlp("one") + pred1.cats = {"winter": 0.0, "summer": 0.0, "autumn": 0.0, "spring": 1.0} + train_examples.append(Example(ref1, pred1)) + + ref2 = nlp("two") + # reference 'spring' is missing, pred 'spring' is 1 + ref2.cats = {"winter": 0.0, "summer": 0.0, "autumn": 1.0} + pred2 = nlp("two") + pred2.cats = {"winter": 0.0, "summer": 0.0, "autumn": 0.0, "spring": 1.0} + train_examples.append(Example(pred2, ref2)) + + scores = Scorer().score_cats( + train_examples, + "cats", + labels=["winter", "summer", "spring", "autumn"], + multi_label=multi_label, + ) + assert scores["cats_f_per_type"]["spring"]["p"] == spring_p + assert scores["cats_f_per_type"]["spring"]["r"] == 1 / 1 + + +@pytest.mark.parametrize( + "multi_label,expected_loss", + [(True, 0), (False, 0.125)], +) +def test_textcat_loss(multi_label: bool, expected_loss: float): + """ + multi-label: the missing 'spring' in gold_doc_2 doesn't incur an increase in loss + exclusive labels: the missing 'spring' in gold_doc_2 is interpreted as 0.0 and adds to the loss""" + train_examples = [] + nlp = English() + + doc1 = nlp("one") + cats1 = {"winter": 0.0, "summer": 0.0, "autumn": 0.0, "spring": 1.0} + train_examples.append(Example.from_dict(doc1, {"cats": cats1})) + + doc2 = nlp("two") + cats2 = {"winter": 0.0, "summer": 0.0, "autumn": 1.0} + 
train_examples.append(Example.from_dict(doc2, {"cats": cats2})) + + if multi_label: + textcat = nlp.add_pipe("textcat_multilabel") + else: + textcat = nlp.add_pipe("textcat") + textcat.initialize(lambda: train_examples) + assert isinstance(textcat, TextCategorizer) + scores = textcat.model.ops.asarray( + [[0.0, 0.0, 0.0, 1.0], [0.0, 0.0, 1.0, 1.0]], dtype="f" # type: ignore + ) + loss, d_scores = textcat.get_loss(train_examples, scores) + assert loss == expected_loss + + def test_textcat_threshold(): # Ensure the scorer can be called with a different threshold nlp = English() diff --git a/website/docs/api/textcategorizer.md b/website/docs/api/textcategorizer.md index 47f868637..2ff569bad 100644 --- a/website/docs/api/textcategorizer.md +++ b/website/docs/api/textcategorizer.md @@ -34,7 +34,11 @@ only. Predictions will be saved to `doc.cats` as a dictionary, where the key is the name of the category and the value is a score between 0 and 1 (inclusive). For `textcat` (exclusive categories), the scores will sum to 1, while for -`textcat_multilabel` there is no particular guarantee about their sum. +`textcat_multilabel` there is no particular guarantee about their sum. This also +means that for `textcat`, missing values are equated to a value of 0 (i.e. +`False`) and are counted as such towards the loss and scoring metrics. This is +not the case for `textcat_multilabel`, where missing values in the gold standard +data do not influence the loss or accuracy calculations. Note that when assigning values to create training data, the score of each category must be 0 or 1. Using other values, for example to create a document From 176a90edeec38ced8c5b1e2f7fd1d28bf1e9e1c1 Mon Sep 17 00:00:00 2001 From: jsnfly <37632631+jsnfly@users.noreply.github.com> Date: Thu, 13 Jan 2022 09:03:23 +0100 Subject: [PATCH 009/123] Fix texcat loss scaling (#9904) (#10002) * add failing test for issue 9904 * remove division by batch size and summation before applying the mean Co-authored-by: jonas --- spacy/pipeline/textcat.py | 4 ++-- spacy/tests/pipeline/test_textcat.py | 15 +++++++++++++++ 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py index e20ae87f1..dd5fdc078 100644 --- a/spacy/pipeline/textcat.py +++ b/spacy/pipeline/textcat.py @@ -320,9 +320,9 @@ class TextCategorizer(TrainablePipe): self._validate_categories(examples) truths, not_missing = self._examples_to_truth(examples) not_missing = self.model.ops.asarray(not_missing) # type: ignore - d_scores = (scores - truths) / scores.shape[0] + d_scores = (scores - truths) d_scores *= not_missing - mean_square_error = (d_scores ** 2).sum(axis=1).mean() + mean_square_error = (d_scores ** 2).mean() return float(mean_square_error), d_scores def add_label(self, label: str) -> int: diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py index 52bf6ec5c..798dd165e 100644 --- a/spacy/tests/pipeline/test_textcat.py +++ b/spacy/tests/pipeline/test_textcat.py @@ -277,6 +277,21 @@ def test_issue7019(): print_prf_per_type(msg, scores, name="foo", type="bar") +@pytest.mark.issue(9904) +def test_issue9904(): + nlp = Language() + textcat = nlp.add_pipe("textcat") + get_examples = make_get_examples_single_label(nlp) + nlp.initialize(get_examples) + + examples = get_examples() + scores = textcat.predict([eg.predicted for eg in examples]) + + loss = textcat.get_loss(examples, scores)[0] + loss_double_bs = textcat.get_loss(examples * 2, scores.repeat(2, axis=0))[0] + assert loss == 
pytest.approx(loss_double_bs) + + @pytest.mark.skip(reason="Test is flakey when run with others") def test_simple_train(): nlp = Language() From 677c1a35072ff2deb3af6638802f506d623ed8f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Thu, 13 Jan 2022 09:03:55 +0100 Subject: [PATCH 010/123] Speed up the StateC::L feature function (#10019) * Speed up the StateC::L feature function This function gets the n-th most-recent left-arc with a particular head. Before this change, StateC::L would construct a vector of all left-arcs with the given head and then pick the n-th most recent from that vector. Since the number of left-arcs strongly correlates with the doc length and the feature is constructed for every transition, this can make transition-parsing quadratic. With this change StateC::L: - Searches left-arcs backwards. - Stops early when the n-th matching transition is found. - Does not construct a vector (reducing memory pressure). This change doesn't avoid the linear search when the transition that is queried does not occur in the left-arcs. Regardless, performance is improved quite a bit with very long docs: Before: N Time 400 3.3 800 5.4 1600 11.6 3200 30.7 After: N Time 400 3.2 800 5.0 1600 9.5 3200 23.2 We can probably do better with more tailored data structures, but I first wanted to make a low-impact PR. Found while investigating #9858. * StateC::L: simplify loop --- spacy/pipeline/_parser_internals/_state.pxd | 23 +++++++++++++-------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/spacy/pipeline/_parser_internals/_state.pxd b/spacy/pipeline/_parser_internals/_state.pxd index 161f3ca48..27623e7c6 100644 --- a/spacy/pipeline/_parser_internals/_state.pxd +++ b/spacy/pipeline/_parser_internals/_state.pxd @@ -1,3 +1,4 @@ +from cython.operator cimport dereference as deref, preincrement as incr from libc.string cimport memcpy, memset from libc.stdlib cimport calloc, free from libc.stdint cimport uint32_t, uint64_t @@ -184,16 +185,20 @@ cdef cppclass StateC: int L(int head, int idx) nogil const: if idx < 1 or this._left_arcs.size() == 0: return -1 - cdef vector[int] lefts - for i in range(this._left_arcs.size()): - arc = this._left_arcs.at(i) + + # Work backwards through left-arcs to find the arc at the + # requested index more quickly. + cdef size_t child_index = 0 + it = this._left_arcs.const_rbegin() + while it != this._left_arcs.rend(): + arc = deref(it) if arc.head == head and arc.child != -1 and arc.child < head: - lefts.push_back(arc.child) - idx = (lefts.size()) - idx - if idx < 0: - return -1 - else: - return lefts.at(idx) + child_index += 1 + if child_index == idx: + return arc.child + incr(it) + + return -1 int R(int head, int idx) nogil const: if idx < 1 or this._right_arcs.size() == 0: From 63fa55089dff3b5a5208c24914cd0faa5909108a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= Date: Thu, 13 Jan 2022 10:33:30 +0100 Subject: [PATCH 011/123] Use constant-time head lookups in StateC::{L,R} This change changes the type of left/right-arc collections from vector[ArcC] to unordered_map[int, vector[Arc]], so that the arcs are keyed by the head. This allows us to find all the left/right arcs for a particular head in constant time in StateC::{L,R}. Benchmarks with long docs (N is the number of text repetitions): Before (using #10019): N Time (s) 400 3.2 800 5.0 1600 9.5 3200 23.2 6400 66.8 12800 220.0 After (this commit): N Time (s) 400 3.1 800 4.3 1600 6.7 3200 12.0 6400 22.0 12800 42.0 Related to #9858 and #10019. 
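Conceptually, the data-structure change amounts to the following pure-Python sketch (hypothetical names only; it ignores deleted arcs and the rest of StateC's bookkeeping — the real change is the Cython/C++ diff below):

```python
from collections import defaultdict


class ArcIndex:
    """Illustrative stand-in for StateC's left-arc bookkeeping."""

    def __init__(self):
        # Before: a single flat vector of arcs, so finding the n-th left
        # child of a given head meant scanning every arc added so far
        # (linear in the number of arcs, hence roughly in doc length).
        # After: arcs keyed by head, so the per-head list is located in
        # O(1) and only that (usually short) list is walked.
        self._left_arcs = defaultdict(list)

    def add_left_arc(self, head: int, child: int) -> None:
        self._left_arcs[head].append(child)

    def nth_left_child(self, head: int, idx: int) -> int:
        """Return the idx-th most recent left child of `head`, or -1."""
        children = self._left_arcs.get(head)
        if not children or idx < 1 or idx > len(children):
            return -1
        return children[-idx]  # idx=1 -> most recently added arc
```

Because the parser asks for this feature on every transition, replacing the scan over all arcs with a keyed lookup is what turns the quadratic behaviour on very long docs into the much closer to linear timings shown in the benchmark above.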
--- spacy/pipeline/_parser_internals/_state.pxd | 120 ++++++++++++-------- 1 file changed, 70 insertions(+), 50 deletions(-) diff --git a/spacy/pipeline/_parser_internals/_state.pxd b/spacy/pipeline/_parser_internals/_state.pxd index 27623e7c6..a1262bb61 100644 --- a/spacy/pipeline/_parser_internals/_state.pxd +++ b/spacy/pipeline/_parser_internals/_state.pxd @@ -3,6 +3,7 @@ from libc.string cimport memcpy, memset from libc.stdlib cimport calloc, free from libc.stdint cimport uint32_t, uint64_t cimport libcpp +from libcpp.unordered_map cimport unordered_map from libcpp.vector cimport vector from libcpp.set cimport set from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno @@ -30,8 +31,8 @@ cdef cppclass StateC: vector[int] _stack vector[int] _rebuffer vector[SpanC] _ents - vector[ArcC] _left_arcs - vector[ArcC] _right_arcs + unordered_map[int, vector[ArcC]] _left_arcs + unordered_map[int, vector[ArcC]] _right_arcs vector[libcpp.bool] _unshiftable set[int] _sent_starts TokenC _empty_token @@ -160,15 +161,22 @@ cdef cppclass StateC: else: return &this._sent[i] - void get_arcs(vector[ArcC]* arcs) nogil const: - for i in range(this._left_arcs.size()): - arc = this._left_arcs.at(i) - if arc.head != -1 and arc.child != -1: - arcs.push_back(arc) - for i in range(this._right_arcs.size()): - arc = this._right_arcs.at(i) - if arc.head != -1 and arc.child != -1: - arcs.push_back(arc) + void map_get_arcs(const unordered_map[int, vector[ArcC]] &heads_arcs, vector[ArcC]* out) nogil const: + cdef const vector[ArcC]* arcs + head_arcs_it = heads_arcs.const_begin() + while head_arcs_it != heads_arcs.const_end(): + arcs = &deref(head_arcs_it).second + arcs_it = arcs.const_begin() + while arcs_it != arcs.const_end(): + arc = deref(arcs_it) + if arc.head != -1 and arc.child != -1: + out.push_back(arc) + incr(arcs_it) + incr(head_arcs_it) + + void get_arcs(vector[ArcC]* out) nogil const: + this.map_get_arcs(this._left_arcs, out) + this.map_get_arcs(this._right_arcs, out) int H(int child) nogil const: if child >= this.length or child < 0: @@ -182,37 +190,35 @@ cdef cppclass StateC: else: return this._ents.back().start - int L(int head, int idx) nogil const: - if idx < 1 or this._left_arcs.size() == 0: + int nth_child(const unordered_map[int, vector[ArcC]]& heads_arcs, int head, int idx) nogil const: + if idx < 1: return -1 - # Work backwards through left-arcs to find the arc at the + head_arcs_it = heads_arcs.const_find(head) + if head_arcs_it == heads_arcs.const_end(): + return -1 + + cdef const vector[ArcC]* arcs = &deref(head_arcs_it).second + + # Work backwards through arcs to find the arc at the # requested index more quickly. 
cdef size_t child_index = 0 - it = this._left_arcs.const_rbegin() - while it != this._left_arcs.rend(): - arc = deref(it) - if arc.head == head and arc.child != -1 and arc.child < head: + arcs_it = arcs.const_rbegin() + while arcs_it != arcs.const_rend() and child_index != idx: + arc = deref(arcs_it) + if arc.child != -1: child_index += 1 if child_index == idx: return arc.child - incr(it) + incr(arcs_it) return -1 + int L(int head, int idx) nogil const: + return this.nth_child(this._left_arcs, head, idx) + int R(int head, int idx) nogil const: - if idx < 1 or this._right_arcs.size() == 0: - return -1 - cdef vector[int] rights - for i in range(this._right_arcs.size()): - arc = this._right_arcs.at(i) - if arc.head == head and arc.child != -1 and arc.child > head: - rights.push_back(arc.child) - idx = (rights.size()) - idx - if idx < 0: - return -1 - else: - return rights.at(idx) + return this.nth_child(this._right_arcs, head, idx) bint empty() nogil const: return this._stack.size() == 0 @@ -253,22 +259,29 @@ cdef cppclass StateC: int r_edge(int word) nogil const: return word - - int n_L(int head) nogil const: + + int n_arcs(const unordered_map[int, vector[ArcC]] &heads_arcs, int head) nogil const: cdef int n = 0 - for i in range(this._left_arcs.size()): - arc = this._left_arcs.at(i) - if arc.head == head and arc.child != -1 and arc.child < arc.head: + head_arcs_it = heads_arcs.const_find(head) + if head_arcs_it == heads_arcs.const_end(): + return n + + cdef const vector[ArcC]* arcs = &deref(head_arcs_it).second + arcs_it = arcs.const_begin() + while arcs_it != arcs.end(): + arc = deref(arcs_it) + if arc.child != -1: n += 1 + incr(arcs_it) + return n + + int n_L(int head) nogil const: + return n_arcs(this._left_arcs, head) + int n_R(int head) nogil const: - cdef int n = 0 - for i in range(this._right_arcs.size()): - arc = this._right_arcs.at(i) - if arc.head == head and arc.child != -1 and arc.child > arc.head: - n += 1 - return n + return n_arcs(this._right_arcs, head) bint stack_is_connected() nogil const: return False @@ -328,19 +341,20 @@ cdef cppclass StateC: arc.child = child arc.label = label if head > child: - this._left_arcs.push_back(arc) + this._left_arcs[arc.head].push_back(arc) else: - this._right_arcs.push_back(arc) + this._right_arcs[arc.head].push_back(arc) this._heads[child] = head - void del_arc(int h_i, int c_i) nogil: - cdef vector[ArcC]* arcs - if h_i > c_i: - arcs = &this._left_arcs - else: - arcs = &this._right_arcs + void map_del_arc(unordered_map[int, vector[ArcC]]* heads_arcs, int h_i, int c_i) nogil: + arcs_it = heads_arcs.find(h_i) + if arcs_it == heads_arcs.end(): + return + + arcs = &deref(arcs_it).second if arcs.size() == 0: return + arc = arcs.back() if arc.head == h_i and arc.child == c_i: arcs.pop_back() @@ -353,6 +367,12 @@ cdef cppclass StateC: arc.label = 0 break + void del_arc(int h_i, int c_i) nogil: + if h_i > c_i: + this.map_del_arc(&this._left_arcs, h_i, c_i) + else: + this.map_del_arc(&this._right_arcs, h_i, c_i) + SpanC get_ent() nogil const: cdef SpanC ent if this._ents.size() == 0: From 47ea6704f1045ee3a04ac7ffbfedba01d944e233 Mon Sep 17 00:00:00 2001 From: Natalia Rodnova <4512370+nrodnova@users.noreply.github.com> Date: Mon, 17 Jan 2022 03:17:49 -0700 Subject: [PATCH 012/123] Span richcmp fix (#9956) * Corrected Span's __richcmp__ implementation to take end, label and kb_id in consideration * Updated test * Updated test * Removed formatting from a test for readability sake * Use same tuples for all comparisons Co-authored-by: Adriane Boyd --- 
spacy/tests/doc/test_span.py | 49 ++++++++++++++++++++++++++++++++++++ spacy/tokens/span.pyx | 28 ++++++--------------- 2 files changed, 57 insertions(+), 20 deletions(-) diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py index 10aba5b94..bdf34c1c1 100644 --- a/spacy/tests/doc/test_span.py +++ b/spacy/tests/doc/test_span.py @@ -573,6 +573,55 @@ def test_span_with_vectors(doc): doc.vocab.vectors = prev_vectors +# fmt: off +def test_span_comparison(doc): + + # Identical start, end, only differ in label and kb_id + assert Span(doc, 0, 3) == Span(doc, 0, 3) + assert Span(doc, 0, 3, "LABEL") == Span(doc, 0, 3, "LABEL") + assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") == Span(doc, 0, 3, "LABEL", kb_id="KB_ID") + + assert Span(doc, 0, 3) != Span(doc, 0, 3, "LABEL") + assert Span(doc, 0, 3) != Span(doc, 0, 3, "LABEL", kb_id="KB_ID") + assert Span(doc, 0, 3, "LABEL") != Span(doc, 0, 3, "LABEL", kb_id="KB_ID") + + assert Span(doc, 0, 3) <= Span(doc, 0, 3) and Span(doc, 0, 3) >= Span(doc, 0, 3) + assert Span(doc, 0, 3, "LABEL") <= Span(doc, 0, 3, "LABEL") and Span(doc, 0, 3, "LABEL") >= Span(doc, 0, 3, "LABEL") + assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") <= Span(doc, 0, 3, "LABEL", kb_id="KB_ID") + assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") >= Span(doc, 0, 3, "LABEL", kb_id="KB_ID") + + assert (Span(doc, 0, 3) < Span(doc, 0, 3, "", kb_id="KB_ID") < Span(doc, 0, 3, "LABEL") < Span(doc, 0, 3, "LABEL", kb_id="KB_ID")) + assert (Span(doc, 0, 3) <= Span(doc, 0, 3, "", kb_id="KB_ID") <= Span(doc, 0, 3, "LABEL") <= Span(doc, 0, 3, "LABEL", kb_id="KB_ID")) + + assert (Span(doc, 0, 3, "LABEL", kb_id="KB_ID") > Span(doc, 0, 3, "LABEL") > Span(doc, 0, 3, "", kb_id="KB_ID") > Span(doc, 0, 3)) + assert (Span(doc, 0, 3, "LABEL", kb_id="KB_ID") >= Span(doc, 0, 3, "LABEL") >= Span(doc, 0, 3, "", kb_id="KB_ID") >= Span(doc, 0, 3)) + + # Different end + assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") < Span(doc, 0, 4, "LABEL", kb_id="KB_ID") + + assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") < Span(doc, 0, 4) + assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") <= Span(doc, 0, 4) + assert Span(doc, 0, 4) > Span(doc, 0, 3, "LABEL", kb_id="KB_ID") + assert Span(doc, 0, 4) >= Span(doc, 0, 3, "LABEL", kb_id="KB_ID") + + # Different start + assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") != Span(doc, 1, 3, "LABEL", kb_id="KB_ID") + + assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") < Span(doc, 1, 3) + assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") <= Span(doc, 1, 3) + assert Span(doc, 1, 3) > Span(doc, 0, 3, "LABEL", kb_id="KB_ID") + assert Span(doc, 1, 3) >= Span(doc, 0, 3, "LABEL", kb_id="KB_ID") + + # Different start & different end + assert Span(doc, 0, 4, "LABEL", kb_id="KB_ID") != Span(doc, 1, 3, "LABEL", kb_id="KB_ID") + + assert Span(doc, 0, 4, "LABEL", kb_id="KB_ID") < Span(doc, 1, 3) + assert Span(doc, 0, 4, "LABEL", kb_id="KB_ID") <= Span(doc, 1, 3) + assert Span(doc, 1, 3) > Span(doc, 0, 4, "LABEL", kb_id="KB_ID") + assert Span(doc, 1, 3) >= Span(doc, 0, 4, "LABEL", kb_id="KB_ID") +# fmt: on + + @pytest.mark.parametrize( "start,end,expected_sentences,expected_sentences_with_hook", [ diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index cd02cab36..5484b25fd 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -126,38 +126,26 @@ cdef class Span: return False else: return True + self_tuple = (self.c.start_char, self.c.end_char, self.c.label, self.c.kb_id, self.doc) + other_tuple = (other.c.start_char, other.c.end_char, other.c.label, other.c.kb_id, other.doc) # < if 
op == 0: - return self.c.start_char < other.c.start_char + return self_tuple < other_tuple # <= elif op == 1: - return self.c.start_char <= other.c.start_char + return self_tuple <= other_tuple # == elif op == 2: - # Do the cheap comparisons first - return ( - (self.c.start_char == other.c.start_char) and \ - (self.c.end_char == other.c.end_char) and \ - (self.c.label == other.c.label) and \ - (self.c.kb_id == other.c.kb_id) and \ - (self.doc == other.doc) - ) + return self_tuple == other_tuple # != elif op == 3: - # Do the cheap comparisons first - return not ( - (self.c.start_char == other.c.start_char) and \ - (self.c.end_char == other.c.end_char) and \ - (self.c.label == other.c.label) and \ - (self.c.kb_id == other.c.kb_id) and \ - (self.doc == other.doc) - ) + return self_tuple != other_tuple # > elif op == 4: - return self.c.start_char > other.c.start_char + return self_tuple > other_tuple # >= elif op == 5: - return self.c.start_char >= other.c.start_char + return self_tuple >= other_tuple def __hash__(self): return hash((self.doc, self.c.start_char, self.c.end_char, self.c.label, self.c.kb_id)) From 2abd380f2d17010fe22fe52e8aab529d70cbeec6 Mon Sep 17 00:00:00 2001 From: pepemedigu Date: Thu, 20 Jan 2022 15:44:13 +0100 Subject: [PATCH 013/123] Update lex_attrs.py for Spanish with ordinals (#10038) * Update lex_attrs.py Add ordinal words * black formatting Co-authored-by: Sofie Van Landeghem --- spacy/lang/es/lex_attrs.py | 41 +++++++++++++++++++++++++++++++++++++- 1 file changed, 40 insertions(+), 1 deletion(-) diff --git a/spacy/lang/es/lex_attrs.py b/spacy/lang/es/lex_attrs.py index 988dbaba1..9d1fa93b8 100644 --- a/spacy/lang/es/lex_attrs.py +++ b/spacy/lang/es/lex_attrs.py @@ -47,6 +47,41 @@ _num_words = [ ] +_ordinal_words = [ + "primero", + "segundo", + "tercero", + "cuarto", + "quinto", + "sexto", + "séptimo", + "octavo", + "noveno", + "décimo", + "undécimo", + "duodécimo", + "decimotercero", + "decimocuarto", + "decimoquinto", + "decimosexto", + "decimoséptimo", + "decimoctavo", + "decimonoveno", + "vigésimo", + "trigésimo", + "cuadragésimo", + "quincuagésimo", + "sexagésimo", + "septuagésimo", + "octogésima", + "nonagésima", + "centésima", + "milésima", + "millonésima", + "billonésima", +] + + def like_num(text): if text.startswith(("+", "-", "±", "~")): text = text[1:] @@ -57,7 +92,11 @@ def like_num(text): num, denom = text.split("/") if num.isdigit() and denom.isdigit(): return True - if text.lower() in _num_words: + text_lower = text.lower() + if text_lower in _num_words: + return True + # Check ordinal number + if text_lower in _ordinal_words: return True return False From fc3d446c7188e128f851b2a9c8ec446748bdc02f Mon Sep 17 00:00:00 2001 From: Evgen Kytonin Date: Tue, 1 Feb 2022 13:24:00 +0200 Subject: [PATCH 014/123] Update Ukrainian tokenizer_exceptions --- spacy/lang/uk/tokenizer_exceptions.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/spacy/lang/uk/tokenizer_exceptions.py b/spacy/lang/uk/tokenizer_exceptions.py index 94016fd52..7e168a27c 100644 --- a/spacy/lang/uk/tokenizer_exceptions.py +++ b/spacy/lang/uk/tokenizer_exceptions.py @@ -6,19 +6,30 @@ from ...util import update_exc _exc = {} for exc_data in [ + {ORTH: "обл.", NORM: "область"}, + {ORTH: "р-н.", NORM: "район"}, + {ORTH: "р-н", NORM: "район"}, + {ORTH: "м.", NORM: "місто"}, {ORTH: "вул.", NORM: "вулиця"}, - {ORTH: "ім.", NORM: "імені"}, {ORTH: "просп.", NORM: "проспект"}, + {ORTH: "пр-кт", NORM: "проспект"}, {ORTH: "бул.", NORM: "бульвар"}, {ORTH: "пров.", NORM: 
"провулок"}, {ORTH: "пл.", NORM: "площа"}, + {ORTH: "майд.", NORM: "майдан"}, + {ORTH: "мкр.", NORM: "мікрорайон"}, + {ORTH: "ст.", NORM: "станція"}, + {ORTH: "ж/м", NORM: "житловий масив"}, + {ORTH: "наб.", NORM: "набережна"}, + {ORTH: "в/ч", NORM: "військова частина"}, + {ORTH: "в/м", NORM: "військове містечко"}, + {ORTH: "оз.", NORM: "озеро"}, + {ORTH: "ім.", NORM: "імені"}, {ORTH: "г.", NORM: "гора"}, {ORTH: "п.", NORM: "пан"}, - {ORTH: "м.", NORM: "місто"}, {ORTH: "проф.", NORM: "професор"}, {ORTH: "акад.", NORM: "академік"}, {ORTH: "доц.", NORM: "доцент"}, - {ORTH: "оз.", NORM: "озеро"}, ]: _exc[exc_data[ORTH]] = [exc_data] From fef896ce49093357247d223e4f4d65d8811ac380 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 3 Feb 2022 17:01:53 +0100 Subject: [PATCH 015/123] Allow Example to align whitespace annotation (#10189) Remove exception for whitespace tokens in `Example.get_aligned` so that annotation on whitespace tokens is aligned in the same way as for non-whitespace tokens. --- spacy/tests/training/test_new_example.py | 10 ++++++++++ spacy/training/example.pyx | 21 +++++++++------------ 2 files changed, 19 insertions(+), 12 deletions(-) diff --git a/spacy/tests/training/test_new_example.py b/spacy/tests/training/test_new_example.py index 4dd90f416..a39d40ded 100644 --- a/spacy/tests/training/test_new_example.py +++ b/spacy/tests/training/test_new_example.py @@ -421,3 +421,13 @@ def test_Example_missing_heads(): # Ensure that the missing head doesn't create an artificial new sentence start expected = [True, False, False, False, False, False] assert example.get_aligned_sent_starts() == expected + + +def test_Example_aligned_whitespace(en_vocab): + words = ["a", " ", "b"] + tags = ["A", "SPACE", "B"] + predicted = Doc(en_vocab, words=words) + reference = Doc(en_vocab, words=words, tags=tags) + + example = Example(predicted, reference) + assert example.get_aligned("TAG", as_string=True) == tags diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx index 732203e7b..d792c9bbf 100644 --- a/spacy/training/example.pyx +++ b/spacy/training/example.pyx @@ -159,20 +159,17 @@ cdef class Example: gold_values = self.reference.to_array([field]) output = [None] * len(self.predicted) for token in self.predicted: - if token.is_space: + values = gold_values[align[token.i].dataXd] + values = values.ravel() + if len(values) == 0: output[token.i] = None + elif len(values) == 1: + output[token.i] = values[0] + elif len(set(list(values))) == 1: + # If all aligned tokens have the same value, use it. + output[token.i] = values[0] else: - values = gold_values[align[token.i].dataXd] - values = values.ravel() - if len(values) == 0: - output[token.i] = None - elif len(values) == 1: - output[token.i] = values[0] - elif len(set(list(values))) == 1: - # If all aligned tokens have the same value, use it. 
- output[token.i] = values[0] - else: - output[token.i] = None + output[token.i] = None if as_string and field not in ["ENT_IOB", "SENT_START"]: output = [vocab.strings[o] if o is not None else o for o in output] return output From e9c26f2ee9f03c2aa6b7cd724f4c0b3717507211 Mon Sep 17 00:00:00 2001 From: Antti Ajanki Date: Tue, 8 Feb 2022 09:44:11 +0200 Subject: [PATCH 016/123] Add a noun chunker for Finnish (#10214) with test cases --- spacy/lang/fi/__init__.py | 2 + spacy/lang/fi/syntax_iterators.py | 79 +++++++++++ spacy/tests/lang/fi/test_noun_chunks.py | 174 ++++++++++++++++++++++++ 3 files changed, 255 insertions(+) create mode 100644 spacy/lang/fi/syntax_iterators.py create mode 100644 spacy/tests/lang/fi/test_noun_chunks.py diff --git a/spacy/lang/fi/__init__.py b/spacy/lang/fi/__init__.py index 86a834170..c3a0cf451 100644 --- a/spacy/lang/fi/__init__.py +++ b/spacy/lang/fi/__init__.py @@ -2,6 +2,7 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES +from .syntax_iterators import SYNTAX_ITERATORS from ...language import Language, BaseDefaults @@ -11,6 +12,7 @@ class FinnishDefaults(BaseDefaults): tokenizer_exceptions = TOKENIZER_EXCEPTIONS lex_attr_getters = LEX_ATTRS stop_words = STOP_WORDS + syntax_iterators = SYNTAX_ITERATORS class Finnish(Language): diff --git a/spacy/lang/fi/syntax_iterators.py b/spacy/lang/fi/syntax_iterators.py new file mode 100644 index 000000000..6b481e51f --- /dev/null +++ b/spacy/lang/fi/syntax_iterators.py @@ -0,0 +1,79 @@ +from typing import Iterator, Tuple, Union +from ...tokens import Doc, Span +from ...symbols import NOUN, PROPN, PRON +from ...errors import Errors + + +def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]: + """Detect base noun phrases from a dependency parse. Works on both Doc and Span.""" + labels = [ + "appos", + "nsubj", + "nsubj:cop", + "obj", + "obl", + "ROOT", + ] + extend_labels = [ + "amod", + "compound", + "compound:nn", + "flat:name", + "nmod", + "nmod:gobj", + "nmod:gsubj", + "nmod:poss", + "nummod", + ] + + def potential_np_head(word): + return word.pos in (NOUN, PROPN) and ( + word.dep in np_deps or word.head.pos == PRON + ) + + doc = doclike.doc # Ensure works on both Doc and Span. + if not doc.has_annotation("DEP"): + raise ValueError(Errors.E029) + + np_deps = [doc.vocab.strings[label] for label in labels] + extend_deps = [doc.vocab.strings[label] for label in extend_labels] + np_label = doc.vocab.strings.add("NP") + conj_label = doc.vocab.strings.add("conj") + + rbracket = 0 + prev_end = -1 + for i, word in enumerate(doclike): + if i < rbracket: + continue + + # Is this a potential independent NP head or coordinated with + # a NOUN that is itself an independent NP head? + # + # e.g. "Terveyden ja hyvinvoinnin laitos" + if potential_np_head(word) or ( + word.dep == conj_label and potential_np_head(word.head) + ): + # Try to extend to the left to include adjective/num + # modifiers, compound words etc. 
+ lbracket = word.i + for ldep in word.lefts: + if ldep.dep in extend_deps: + lbracket = ldep.left_edge.i + break + + # Prevent nested chunks from being produced + if lbracket <= prev_end: + continue + + rbracket = word.i + # Try to extend the span to the right to capture + # appositions and noun modifiers + for rdep in word.rights: + if rdep.dep in extend_deps: + rbracket = rdep.i + prev_end = rbracket + + yield lbracket, rbracket + 1, np_label + + +SYNTAX_ITERATORS = {"noun_chunks": noun_chunks} diff --git a/spacy/tests/lang/fi/test_noun_chunks.py b/spacy/tests/lang/fi/test_noun_chunks.py new file mode 100644 index 000000000..cc3b5aa36 --- /dev/null +++ b/spacy/tests/lang/fi/test_noun_chunks.py @@ -0,0 +1,174 @@ +import pytest +from spacy.tokens import Doc + + +FI_NP_TEST_EXAMPLES = [ + ( + "Kaksi tyttöä potkii punaista palloa", + ["NUM", "NOUN", "VERB", "ADJ", "NOUN"], + ["nummod", "nsubj", "ROOT", "amod", "obj"], + [1, 1, 0, 1, -2], + ["Kaksi tyttöä", "punaista palloa"], + ), + ( + "Erittäin vaarallinen leijona karkasi kiertävän sirkuksen eläintenkesyttäjältä", + ["ADV", "ADJ", "NOUN", "VERB", "ADJ", "NOUN", "NOUN"], + ["advmod", "amod", "nsubj", "ROOT", "amod", "nmod:poss", "obl"], + [1, 1, 1, 0, 1, 1, -3], + ["Erittäin vaarallinen leijona", "kiertävän sirkuksen eläintenkesyttäjältä"], + ), + ( + "Leijona raidallisine tassuineen piileksii Porin kaupungin lähellä", + ["NOUN", "ADJ", "NOUN", "VERB", "PROPN", "NOUN", "ADP"], + ["nsubj", "amod", "nmod", "ROOT", "nmod:poss", "obl", "case"], + [3, 1, -2, 0, 1, -2, -1], + ["Leijona raidallisine tassuineen", "Porin kaupungin"], + ), + ( + "Lounaalla nautittiin salaattia, maukasta kanaa ja raikasta vettä", + ["NOUN", "VERB", "NOUN", "PUNCT", "ADJ", "NOUN", "CCONJ", "ADJ", "NOUN"], + ["obl", "ROOT", "obj", "punct", "amod", "conj", "cc", "amod", "conj"], + [1, 0, -1, 2, 1, -3, 2, 1, -6], + ["Lounaalla", "salaattia", "maukasta kanaa", "raikasta vettä"], + ), + ( + "Minua houkuttaa maalle muuttaminen talven jälkeen", + ["PRON", "VERB", "NOUN", "NOUN", "NOUN", "ADP"], + ["obj", "ROOT", "nmod", "nsubj", "obl", "case"], + [1, 0, 1, -2, -3, -1], + ["maalle muuttaminen", "talven"], + ), + ( + "Päivän kohokohta oli vierailu museossa kummilasten kanssa", + ["NOUN", "NOUN", "AUX", "NOUN", "NOUN", "NOUN", "ADP"], + ["nmod:poss", "nsubj:cop", "cop", "ROOT", "nmod", "obl", "case"], + [1, 2, 1, 0, -1, -2, -1], + ["Päivän kohokohta", "vierailu museossa", "kummilasten"], + ), + ( + "Yrittäjät maksoivat tuomioistuimen määräämät korvaukset", + ["NOUN", "VERB", "NOUN", "VERB", "NOUN"], + ["nsubj", "ROOT", "nsubj", "acl", "obj"], + [1, 0, 1, 1, -3], + ["Yrittäjät", "tuomioistuimen", "korvaukset"], + ), + ( + "Julkisoikeudelliset tai niihin rinnastettavat saatavat ovat suoraan ulosottokelpoisia", + ["ADJ", "CCONJ", "PRON", "VERB", "NOUN", "AUX", "ADV", "NOUN"], + ["amod", "cc", "obl", "acl", "nsubj:cop", "cop", "advmod", "ROOT"], + [4, 3, 1, 1, 3, 2, 1, 0], + ["Julkisoikeudelliset tai niihin rinnastettavat saatavat", "ulosottokelpoisia"], + ), + ( + "Se oli ala-arvoista käytöstä kaikilta oppilailta, myös valvojaoppilailta", + ["PRON", "AUX", "ADJ", "NOUN", "PRON", "NOUN", "PUNCT", "ADV", "NOUN"], + ["nsubj:cop", "cop", "amod", "ROOT", "det", "nmod", "punct", "advmod", "appos"], + [3, 2, 1, 0, 1, -2, 2, 1, -3], + ["ala-arvoista käytöstä kaikilta oppilailta", "valvojaoppilailta"], + ), + ( + "Isä souti veneellä, jonka hän oli vuokrannut", + ["NOUN", "VERB", "NOUN", "PUNCT", "PRON", "PRON", "AUX", "VERB"], + ["nsubj", "ROOT", "obl", "punct", "obj", "nsubj", "aux", 
"acl:relcl"], + [1, 0, -1, 4, 3, 2, 1, -5], + ["Isä", "veneellä"], + ), + ( + "Kirja, jonka poimin hyllystä, kertoo norsuista", + ["NOUN", "PUNCT", "PRON", "VERB", "NOUN", "PUNCT", "VERB", "NOUN"], + ["nsubj", "punct", "obj", "acl:relcl", "obl", "punct", "ROOT", "obl"], + [6, 2, 1, -3, -1, 1, 0, -1], + ["Kirja", "hyllystä", "norsuista"], + ), + ( + "Huomenna on päivä, jota olemme odottaneet", + ["NOUN", "AUX", "NOUN", "PUNCT", "PRON", "AUX", "VERB"], + ["ROOT", "cop", "nsubj:cop", "punct", "obj", "aux", "acl:relcl"], + [0, -1, -2, 3, 2, 1, -4], + ["Huomenna", "päivä"], + ), + ( + "Liikkuvuuden lisääminen on yksi korkeakoulutuksen keskeisistä kehittämiskohteista", + ["NOUN", "NOUN", "AUX", "PRON", "NOUN", "ADJ", "NOUN"], + ["nmod:gobj", "nsubj:cop", "cop", "ROOT", "nmod:poss", "amod", "nmod"], + [1, 2, 1, 0, 2, 1, -3], + [ + "Liikkuvuuden lisääminen", + "korkeakoulutuksen keskeisistä kehittämiskohteista", + ], + ), + ( + "Kaupalliset palvelut jätetään yksityisten palveluntarjoajien tarjottavaksi", + ["ADJ", "NOUN", "VERB", "ADJ", "NOUN", "NOUN"], + ["amod", "obj", "ROOT", "amod", "nmod:gsubj", "obl"], + [1, 1, 0, 1, 1, -3], + ["Kaupalliset palvelut", "yksityisten palveluntarjoajien tarjottavaksi"], + ), + ( + "New York tunnetaan kaupunkina, joka ei koskaan nuku", + ["PROPN", "PROPN", "VERB", "NOUN", "PUNCT", "PRON", "AUX", "ADV", "VERB"], + ["obj", "flat:name", "ROOT", "obl", "punct", "nsubj", "aux", "advmod", "acl:relcl"], + [2, -1, 0, -1, 4, 3, 2, 1, -5], + ["New York", "kaupunkina"], + ), + ( + "Loput vihjeet saat herra Möttöseltä", + ["NOUN", "NOUN", "VERB", "NOUN", "PROPN"], + ["compound:nn", "obj", "ROOT", "compound:nn", "obj"], + [1, 1, 0, 1, -2], + ["Loput vihjeet", "herra Möttöseltä"], + ), + ( + "mahdollisuus tukea muita päivystysyksiköitä", + ["NOUN", "VERB", "PRON", "NOUN"], + ["ROOT", "acl", "det", "obj"], + [0, -1, 1, -2], + ["mahdollisuus", "päivystysyksiköitä"], + ), + ( + "sairaanhoitopiirit harjoittavat leikkaustoimintaa alueellaan useammassa sairaalassa", + ["NOUN", "VERB", "NOUN", "NOUN", "ADJ", "NOUN"], + ["nsubj", "ROOT", "obj", "obl", "amod", "obl"], + [1, 0, -1, -1, 1, -3], + ["sairaanhoitopiirit", "leikkaustoimintaa", "alueellaan", "useammassa sairaalassa"], + ), + ( + "Lain mukaan varhaiskasvatus on suunnitelmallista toimintaa", + ["NOUN", "ADP", "NOUN", "AUX", "ADJ", "NOUN"], + ["obl", "case", "nsubj:cop", "cop", "amod", "ROOT"], + [5, -1, 3, 2, 1, 0], + ["Lain", "varhaiskasvatus", "suunnitelmallista toimintaa"], + ), +] + + +def test_noun_chunks_is_parsed(fi_tokenizer): + """Test that noun_chunks raises Value Error for 'fi' language if Doc is not parsed. + To check this test, we're constructing a Doc + with a new Vocab here and forcing is_parsed to 'False' + to make sure the noun chunks don't run. 
+ """ + doc = fi_tokenizer("Tämä on testi") + with pytest.raises(ValueError): + list(doc.noun_chunks) + + +@pytest.mark.parametrize( + "text,pos,deps,heads,expected_noun_chunks", FI_NP_TEST_EXAMPLES +) +def test_fi_noun_chunks(fi_tokenizer, text, pos, deps, heads, expected_noun_chunks): + tokens = fi_tokenizer(text) + + assert len(heads) == len(pos) + doc = Doc( + tokens.vocab, + words=[t.text for t in tokens], + heads=[head + i for i, head in enumerate(heads)], + deps=deps, + pos=pos, + ) + + noun_chunks = list(doc.noun_chunks) + assert len(noun_chunks) == len(expected_noun_chunks) + for i, np in enumerate(noun_chunks): + assert np.text == expected_noun_chunks[i] From bbaf41fb3b1b0123455b93d7b97a9ef5d886f8b1 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 11 Feb 2022 11:45:26 +0100 Subject: [PATCH 017/123] Set version to v3.2.2 (#10262) --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index c253d5052..d01b278c9 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy" -__version__ = "3.2.1" +__version__ = "3.2.2" __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" __projects__ = "https://github.com/explosion/projects" From 9a06a210ec8ef2a6cd93f4572c3dd18c2532ca71 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 11 Feb 2022 14:22:43 +0100 Subject: [PATCH 018/123] Exclude github workflow edits from CI (#10261) --- azure-pipelines.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 71a793911..8e322f3dd 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -17,6 +17,7 @@ pr: - "*.md" - "website/docs/*" - "website/src/*" + - ".github/workflows/*" jobs: # Perform basic checks for most important errors (syntax etc.) 
Uses the config From 5adedb8587818741dcd4ee1364ffb3f7d5074e75 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 11 Feb 2022 14:23:01 +0100 Subject: [PATCH 019/123] Auto-format code with black (#10260) Co-authored-by: explosion-bot --- spacy/tests/test_cli.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index 9d3f1ee71..fc35ff86e 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -706,17 +706,27 @@ def test_permitted_package_names(): assert _is_permitted_package_name("-package") == False assert _is_permitted_package_name("package-") == False - + def test_debug_data_compile_gold(): nlp = English() pred = Doc(nlp.vocab, words=["Token", ".", "New", "York", "City"]) - ref = Doc(nlp.vocab, words=["Token", ".", "New York City"], sent_starts=[True, False, True], ents=["O", "O", "B-ENT"]) + ref = Doc( + nlp.vocab, + words=["Token", ".", "New York City"], + sent_starts=[True, False, True], + ents=["O", "O", "B-ENT"], + ) eg = Example(pred, ref) data = _compile_gold([eg], ["ner"], nlp, True) assert data["boundary_cross_ents"] == 0 pred = Doc(nlp.vocab, words=["Token", ".", "New", "York", "City"]) - ref = Doc(nlp.vocab, words=["Token", ".", "New York City"], sent_starts=[True, False, True], ents=["O", "B-ENT", "I-ENT"]) + ref = Doc( + nlp.vocab, + words=["Token", ".", "New York City"], + sent_starts=[True, False, True], + ents=["O", "B-ENT", "I-ENT"], + ) eg = Example(pred, ref) data = _compile_gold([eg], ["ner"], nlp, True) - assert data["boundary_cross_ents"] == 1 \ No newline at end of file + assert data["boundary_cross_ents"] == 1 From 8818a44a39f6e8f5387680e28984897a60baa830 Mon Sep 17 00:00:00 2001 From: Markus Konrad Date: Mon, 14 Feb 2022 07:16:43 +0100 Subject: [PATCH 020/123] add tmtoolkit package to spaCy universe (#10245) --- website/meta/universe.json | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/website/meta/universe.json b/website/meta/universe.json index 4ded8880f..d7eef97e8 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -3792,6 +3792,39 @@ "twitter": "jboy" }, "category": ["visualizers", "standalone"] + }, + { + "id": "tmtoolkit", + "slogan": "Text mining and topic modeling toolkit", + "description": "tmtoolkit is a set of tools for text mining and topic modeling with Python developed especially for the use in the social sciences, in journalism or related disciplines. 
It aims for easy installation, extensive documentation and a clear programming interface while offering good performance on large datasets by the means of vectorized operations (via NumPy) and parallel computation (using Python’s multiprocessing module and the loky package).", + "github": "WZBSocialScienceCenter/tmtoolkit", + "code_example": [ + "from tmtoolkit.corpus import Corpus, tokens_table, lemmatize, to_lowercase, dtm", + "from tmtoolkit.bow.bow_stats import tfidf, sorted_terms_table", + "# load built-in sample dataset and use 4 worker processes", + "corp = Corpus.from_builtin_corpus('en-News100', max_workers=4)", + "# investigate corpus as dataframe", + "toktbl = tokens_table(corp)", + "print(toktbl)", + "# apply some text normalization", + "lemmatize(corp)", + "to_lowercase(corp)", + "# build sparse document-token matrix (DTM)", + "# document labels identify rows, vocabulary tokens identify columns", + "mat, doc_labels, vocab = dtm(corp, return_doc_labels=True, return_vocab=True)", + "# apply tf-idf transformation to DTM", + "# operation is applied on sparse matrix and uses few memory", + "tfidf_mat = tfidf(mat)", + "# show top 5 tokens per document ranked by tf-idf", + "top_tokens = sorted_terms_table(tfidf_mat, vocab, doc_labels, top_n=5)", + "print(top_tokens)" + ], + "author": "Markus Konrad / WZB Social Science Center", + "author_links": { + "github": "internaut", + "twitter": "_knrd" + }, + "category": ["scientific", "standalone"] } ], From 23bd103d8940c110e2588e7c93f8e33205e1b3be Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Mon, 14 Feb 2022 15:17:25 +0900 Subject: [PATCH 021/123] Add tmtoolkit setup steps --- website/meta/universe.json | 3 +++ 1 file changed, 3 insertions(+) diff --git a/website/meta/universe.json b/website/meta/universe.json index d7eef97e8..122281583 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -3799,6 +3799,9 @@ "description": "tmtoolkit is a set of tools for text mining and topic modeling with Python developed especially for the use in the social sciences, in journalism or related disciplines. 
It aims for easy installation, extensive documentation and a clear programming interface while offering good performance on large datasets by the means of vectorized operations (via NumPy) and parallel computation (using Python’s multiprocessing module and the loky package).", "github": "WZBSocialScienceCenter/tmtoolkit", "code_example": [ + "# Note: This requires these setup steps:", + "# pip install tmtoolkit[recommended]", + "# python -m tmtoolkit setup en", "from tmtoolkit.corpus import Corpus, tokens_table, lemmatize, to_lowercase, dtm", "from tmtoolkit.bow.bow_stats import tfidf, sorted_terms_table", "# load built-in sample dataset and use 4 worker processes", From f6250015ab4693131bde160ba5659151046cdd1d Mon Sep 17 00:00:00 2001 From: Ryn Daniels <397565+ryndaniels@users.noreply.github.com> Date: Tue, 15 Feb 2022 15:18:36 +0200 Subject: [PATCH 022/123] Fix the datemath for reals (#10294) * add debugging branch and quotes to daily slowtest action * Apparently the quotes fixed it --- .github/workflows/slowtests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/slowtests.yml b/.github/workflows/slowtests.yml index 9490b53bd..3b0f177a7 100644 --- a/.github/workflows/slowtests.yml +++ b/.github/workflows/slowtests.yml @@ -19,7 +19,7 @@ jobs: run: | today=$(date '+%Y-%m-%d %H:%M:%S') yesterday=$(date -d "yesterday" '+%Y-%m-%d %H:%M:%S') - if git log --after=$yesterday --before=$today | grep commit ; then + if git log --after="$yesterday" --before="$today" | grep commit ; then echo "::set-output name=run_tests::true" else echo "::set-output name=run_tests::false" From 22066f4e0fd2a0685932b118bbc7501370c17dd9 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 16 Feb 2022 13:45:30 +0100 Subject: [PATCH 023/123] Also exclude workflows from non-PR CI runs (#10305) --- azure-pipelines.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 8e322f3dd..4624b2eb2 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -11,8 +11,9 @@ trigger: exclude: - "website/*" - "*.md" + - ".github/workflows/*" pr: - paths: + paths: exclude: - "*.md" - "website/docs/*" From d30ee14ab3959addd726eee4555e5f07fe94f062 Mon Sep 17 00:00:00 2001 From: Ryn Daniels <397565+ryndaniels@users.noreply.github.com> Date: Wed, 16 Feb 2022 16:39:42 +0200 Subject: [PATCH 024/123] Pass the matrix branch to the checkout action (#10304) --- .github/workflows/slowtests.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/slowtests.yml b/.github/workflows/slowtests.yml index 3b0f177a7..74f2b8998 100644 --- a/.github/workflows/slowtests.yml +++ b/.github/workflows/slowtests.yml @@ -14,6 +14,8 @@ jobs: steps: - name: Checkout uses: actions/checkout@v1 + with: + ref: ${{ matrix.branch }} - name: Get commits from past 24 hours id: check_commits run: | From fef768ef748d0526c53d147a38243c4dc84e0d28 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Wed, 16 Feb 2022 15:43:36 +0100 Subject: [PATCH 025/123] remove develop (not an active branch anymore) --- .github/workflows/slowtests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/slowtests.yml b/.github/workflows/slowtests.yml index 74f2b8998..1a99c751c 100644 --- a/.github/workflows/slowtests.yml +++ b/.github/workflows/slowtests.yml @@ -9,7 +9,7 @@ jobs: strategy: fail-fast: false matrix: - branch: [master, develop, v4] + branch: [master, v4] runs-on: ubuntu-latest steps: - name: Checkout From 
26eac22d3b46131187c66f4d732603fb54610645 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Wed, 16 Feb 2022 15:44:05 +0100 Subject: [PATCH 026/123] remove develop also from GPU tests --- .github/workflows/gputests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/gputests.yml b/.github/workflows/gputests.yml index 14c1552bf..bb7f51d29 100644 --- a/.github/workflows/gputests.yml +++ b/.github/workflows/gputests.yml @@ -9,7 +9,7 @@ jobs: strategy: fail-fast: false matrix: - branch: [master, develop, v4] + branch: [master, v4] runs-on: ubuntu-latest steps: - name: Trigger buildkite build From da7520a83c6ec6ec22f74bcc265b57620f3b64d8 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 17 Feb 2022 11:35:34 +0100 Subject: [PATCH 027/123] Delay loading of mecab in Korean tokenizer (#10295) * Delay loading of mecab in Korean tokenizer Delay loading of mecab until the tokenizer is called the first time so that it's possible to initialize a blank `ko` pipeline without having mecab installed, e.g. for use with `spacy init vectors`. * Move mecab import back to __init__ Move mecab import back to __init__ to warn users at the same point as before for missing python dependencies. --- spacy/lang/ko/__init__.py | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/spacy/lang/ko/__init__.py b/spacy/lang/ko/__init__.py index 05fc67e79..eb3c2e1f5 100644 --- a/spacy/lang/ko/__init__.py +++ b/spacy/lang/ko/__init__.py @@ -31,15 +31,24 @@ def create_tokenizer(): class KoreanTokenizer(DummyTokenizer): def __init__(self, vocab: Vocab): self.vocab = vocab - MeCab = try_mecab_import() # type: ignore[func-returns-value] - self.mecab_tokenizer = MeCab("-F%f[0],%f[7]") + self._mecab = try_mecab_import() # type: ignore[func-returns-value] + self._mecab_tokenizer = None + + @property + def mecab_tokenizer(self): + # This is a property so that initializing a pipeline with blank:ko is + # possible without actually requiring mecab-ko, e.g. to run + # `spacy init vectors ko` for a pipeline that will have a different + # tokenizer in the end. The languages need to match for the vectors + # to be imported and there's no way to pass a custom config to + # `init vectors`. + if self._mecab_tokenizer is None: + self._mecab_tokenizer = self._mecab("-F%f[0],%f[7]") + return self._mecab_tokenizer def __reduce__(self): return KoreanTokenizer, (self.vocab,) - def __del__(self): - self.mecab_tokenizer.__del__() - def __call__(self, text: str) -> Doc: dtokens = list(self.detailed_tokens(text)) surfaces = [dt["surface"] for dt in dtokens] @@ -90,7 +99,8 @@ def try_mecab_import() -> None: return MeCab except ImportError: raise ImportError( - "Korean support requires [mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md), " + "The Korean tokenizer (\"spacy.ko.KoreanTokenizer\") requires " + "[mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md), " "[mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic), " "and [natto-py](https://github.com/buruzaemon/natto-py)" ) from None From a9756963e67cff6be5445ae441263f889c629123 Mon Sep 17 00:00:00 2001 From: Grey Murav <65895033+gremur@users.noreply.github.com> Date: Thu, 17 Feb 2022 17:48:50 +0300 Subject: [PATCH 028/123] Extend list of abbreviations for ru language (#10282) * Extend list of abbreviations for ru language Extended list of abbreviations for ru language those may have influence on tokenization. 
* black formatting Co-authored-by: thomashacker --- spacy/lang/ru/tokenizer_exceptions.py | 347 +++++++++++++++++++++++++- 1 file changed, 341 insertions(+), 6 deletions(-) diff --git a/spacy/lang/ru/tokenizer_exceptions.py b/spacy/lang/ru/tokenizer_exceptions.py index 1dc363fae..f3756e26c 100644 --- a/spacy/lang/ru/tokenizer_exceptions.py +++ b/spacy/lang/ru/tokenizer_exceptions.py @@ -2,7 +2,6 @@ from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...symbols import ORTH, NORM from ...util import update_exc - _exc = {} _abbrev_exc = [ @@ -42,7 +41,6 @@ _abbrev_exc = [ {ORTH: "дек", NORM: "декабрь"}, ] - for abbrev_desc in _abbrev_exc: abbrev = abbrev_desc[ORTH] for orth in (abbrev, abbrev.capitalize(), abbrev.upper()): @@ -50,17 +48,354 @@ for abbrev_desc in _abbrev_exc: _exc[orth + "."] = [{ORTH: orth + ".", NORM: abbrev_desc[NORM]}] -_slang_exc = [ +for abbr in [ + # Year slang abbreviations {ORTH: "2к15", NORM: "2015"}, {ORTH: "2к16", NORM: "2016"}, {ORTH: "2к17", NORM: "2017"}, {ORTH: "2к18", NORM: "2018"}, {ORTH: "2к19", NORM: "2019"}, {ORTH: "2к20", NORM: "2020"}, -] + {ORTH: "2к21", NORM: "2021"}, + {ORTH: "2к22", NORM: "2022"}, + {ORTH: "2к23", NORM: "2023"}, + {ORTH: "2к24", NORM: "2024"}, + {ORTH: "2к25", NORM: "2025"}, +]: + _exc[abbr[ORTH]] = [abbr] -for slang_desc in _slang_exc: - _exc[slang_desc[ORTH]] = [slang_desc] +for abbr in [ + # Profession and academic titles abbreviations + {ORTH: "ак.", NORM: "академик"}, + {ORTH: "акад.", NORM: "академик"}, + {ORTH: "д-р архитектуры", NORM: "доктор архитектуры"}, + {ORTH: "д-р биол. наук", NORM: "доктор биологических наук"}, + {ORTH: "д-р ветеринар. наук", NORM: "доктор ветеринарных наук"}, + {ORTH: "д-р воен. наук", NORM: "доктор военных наук"}, + {ORTH: "д-р геогр. наук", NORM: "доктор географических наук"}, + {ORTH: "д-р геол.-минерал. наук", NORM: "доктор геолого-минералогических наук"}, + {ORTH: "д-р искусствоведения", NORM: "доктор искусствоведения"}, + {ORTH: "д-р ист. наук", NORM: "доктор исторических наук"}, + {ORTH: "д-р культурологии", NORM: "доктор культурологии"}, + {ORTH: "д-р мед. наук", NORM: "доктор медицинских наук"}, + {ORTH: "д-р пед. наук", NORM: "доктор педагогических наук"}, + {ORTH: "д-р полит. наук", NORM: "доктор политических наук"}, + {ORTH: "д-р психол. наук", NORM: "доктор психологических наук"}, + {ORTH: "д-р с.-х. наук", NORM: "доктор сельскохозяйственных наук"}, + {ORTH: "д-р социол. наук", NORM: "доктор социологических наук"}, + {ORTH: "д-р техн. наук", NORM: "доктор технических наук"}, + {ORTH: "д-р фармацевт. наук", NORM: "доктор фармацевтических наук"}, + {ORTH: "д-р физ.-мат. наук", NORM: "доктор физико-математических наук"}, + {ORTH: "д-р филол. наук", NORM: "доктор филологических наук"}, + {ORTH: "д-р филос. наук", NORM: "доктор философских наук"}, + {ORTH: "д-р хим. наук", NORM: "доктор химических наук"}, + {ORTH: "д-р экон. наук", NORM: "доктор экономических наук"}, + {ORTH: "д-р юрид. 
наук", NORM: "доктор юридических наук"}, + {ORTH: "д-р", NORM: "доктор"}, + {ORTH: "д.б.н.", NORM: "доктор биологических наук"}, + {ORTH: "д.г.-м.н.", NORM: "доктор геолого-минералогических наук"}, + {ORTH: "д.г.н.", NORM: "доктор географических наук"}, + {ORTH: "д.и.н.", NORM: "доктор исторических наук"}, + {ORTH: "д.иск.", NORM: "доктор искусствоведения"}, + {ORTH: "д.м.н.", NORM: "доктор медицинских наук"}, + {ORTH: "д.п.н.", NORM: "доктор психологических наук"}, + {ORTH: "д.пед.н.", NORM: "доктор педагогических наук"}, + {ORTH: "д.полит.н.", NORM: "доктор политических наук"}, + {ORTH: "д.с.-х.н.", NORM: "доктор сельскохозяйственных наук"}, + {ORTH: "д.социол.н.", NORM: "доктор социологических наук"}, + {ORTH: "д.т.н.", NORM: "доктор технических наук"}, + {ORTH: "д.т.н", NORM: "доктор технических наук"}, + {ORTH: "д.ф.-м.н.", NORM: "доктор физико-математических наук"}, + {ORTH: "д.ф.н.", NORM: "доктор филологических наук"}, + {ORTH: "д.филос.н.", NORM: "доктор философских наук"}, + {ORTH: "д.фил.н.", NORM: "доктор филологических наук"}, + {ORTH: "д.х.н.", NORM: "доктор химических наук"}, + {ORTH: "д.э.н.", NORM: "доктор экономических наук"}, + {ORTH: "д.э.н", NORM: "доктор экономических наук"}, + {ORTH: "д.ю.н.", NORM: "доктор юридических наук"}, + {ORTH: "доц.", NORM: "доцент"}, + {ORTH: "и.о.", NORM: "исполняющий обязанности"}, + {ORTH: "к.б.н.", NORM: "кандидат биологических наук"}, + {ORTH: "к.воен.н.", NORM: "кандидат военных наук"}, + {ORTH: "к.г.-м.н.", NORM: "кандидат геолого-минералогических наук"}, + {ORTH: "к.г.н.", NORM: "кандидат географических наук"}, + {ORTH: "к.геогр.н", NORM: "кандидат географических наук"}, + {ORTH: "к.геогр.наук", NORM: "кандидат географических наук"}, + {ORTH: "к.и.н.", NORM: "кандидат исторических наук"}, + {ORTH: "к.иск.", NORM: "кандидат искусствоведения"}, + {ORTH: "к.м.н.", NORM: "кандидат медицинских наук"}, + {ORTH: "к.п.н.", NORM: "кандидат психологических наук"}, + {ORTH: "к.псх.н.", NORM: "кандидат психологических наук"}, + {ORTH: "к.пед.н.", NORM: "кандидат педагогических наук"}, + {ORTH: "канд.пед.наук", NORM: "кандидат педагогических наук"}, + {ORTH: "к.полит.н.", NORM: "кандидат политических наук"}, + {ORTH: "к.с.-х.н.", NORM: "кандидат сельскохозяйственных наук"}, + {ORTH: "к.социол.н.", NORM: "кандидат социологических наук"}, + {ORTH: "к.с.н.", NORM: "кандидат социологических наук"}, + {ORTH: "к.т.н.", NORM: "кандидат технических наук"}, + {ORTH: "к.ф.-м.н.", NORM: "кандидат физико-математических наук"}, + {ORTH: "к.ф.н.", NORM: "кандидат филологических наук"}, + {ORTH: "к.фил.н.", NORM: "кандидат филологических наук"}, + {ORTH: "к.филол.н", NORM: "кандидат филологических наук"}, + {ORTH: "к.фарм.наук", NORM: "кандидат фармакологических наук"}, + {ORTH: "к.фарм.н.", NORM: "кандидат фармакологических наук"}, + {ORTH: "к.фарм.н", NORM: "кандидат фармакологических наук"}, + {ORTH: "к.филос.наук", NORM: "кандидат философских наук"}, + {ORTH: "к.филос.н.", NORM: "кандидат философских наук"}, + {ORTH: "к.филос.н", NORM: "кандидат философских наук"}, + {ORTH: "к.х.н.", NORM: "кандидат химических наук"}, + {ORTH: "к.х.н", NORM: "кандидат химических наук"}, + {ORTH: "к.э.н.", NORM: "кандидат экономических наук"}, + {ORTH: "к.э.н", NORM: "кандидат экономических наук"}, + {ORTH: "к.ю.н.", NORM: "кандидат юридических наук"}, + {ORTH: "к.ю.н", NORM: "кандидат юридических наук"}, + {ORTH: "канд. архитектуры", NORM: "кандидат архитектуры"}, + {ORTH: "канд. биол. наук", NORM: "кандидат биологических наук"}, + {ORTH: "канд. ветеринар. 
наук", NORM: "кандидат ветеринарных наук"}, + {ORTH: "канд. воен. наук", NORM: "кандидат военных наук"}, + {ORTH: "канд. геогр. наук", NORM: "кандидат географических наук"}, + {ORTH: "канд. геол.-минерал. наук", NORM: "кандидат геолого-минералогических наук"}, + {ORTH: "канд. искусствоведения", NORM: "кандидат искусствоведения"}, + {ORTH: "канд. ист. наук", NORM: "кандидат исторических наук"}, + {ORTH: "к.ист.н.", NORM: "кандидат исторических наук"}, + {ORTH: "канд. культурологии", NORM: "кандидат культурологии"}, + {ORTH: "канд. мед. наук", NORM: "кандидат медицинских наук"}, + {ORTH: "канд. пед. наук", NORM: "кандидат педагогических наук"}, + {ORTH: "канд. полит. наук", NORM: "кандидат политических наук"}, + {ORTH: "канд. психол. наук", NORM: "кандидат психологических наук"}, + {ORTH: "канд. с.-х. наук", NORM: "кандидат сельскохозяйственных наук"}, + {ORTH: "канд. социол. наук", NORM: "кандидат социологических наук"}, + {ORTH: "к.соц.наук", NORM: "кандидат социологических наук"}, + {ORTH: "к.соц.н.", NORM: "кандидат социологических наук"}, + {ORTH: "к.соц.н", NORM: "кандидат социологических наук"}, + {ORTH: "канд. техн. наук", NORM: "кандидат технических наук"}, + {ORTH: "канд. фармацевт. наук", NORM: "кандидат фармацевтических наук"}, + {ORTH: "канд. физ.-мат. наук", NORM: "кандидат физико-математических наук"}, + {ORTH: "канд. филол. наук", NORM: "кандидат филологических наук"}, + {ORTH: "канд. филос. наук", NORM: "кандидат философских наук"}, + {ORTH: "канд. хим. наук", NORM: "кандидат химических наук"}, + {ORTH: "канд. экон. наук", NORM: "кандидат экономических наук"}, + {ORTH: "канд. юрид. наук", NORM: "кандидат юридических наук"}, + {ORTH: "в.н.с.", NORM: "ведущий научный сотрудник"}, + {ORTH: "мл. науч. сотр.", NORM: "младший научный сотрудник"}, + {ORTH: "м.н.с.", NORM: "младший научный сотрудник"}, + {ORTH: "проф.", NORM: "профессор"}, + {ORTH: "профессор.кафедры", NORM: "профессор кафедры"}, + {ORTH: "ст. науч. сотр.", NORM: "старший научный сотрудник"}, + {ORTH: "чл.-к.", NORM: "член корреспондент"}, + {ORTH: "чл.-корр.", NORM: "член-корреспондент"}, + {ORTH: "чл.-кор.", NORM: "член-корреспондент"}, + {ORTH: "дир.", NORM: "директор"}, + {ORTH: "зам. дир.", NORM: "заместитель директора"}, + {ORTH: "зав. каф.", NORM: "заведующий кафедрой"}, + {ORTH: "зав.кафедрой", NORM: "заведующий кафедрой"}, + {ORTH: "зав. кафедрой", NORM: "заведующий кафедрой"}, + {ORTH: "асп.", NORM: "аспирант"}, + {ORTH: "гл. науч. сотр.", NORM: "главный научный сотрудник"}, + {ORTH: "вед. науч. сотр.", NORM: "ведущий научный сотрудник"}, + {ORTH: "науч. 
сотр.", NORM: "научный сотрудник"}, + {ORTH: "к.м.с.", NORM: "кандидат в мастера спорта"}, +]: + _exc[abbr[ORTH]] = [abbr] + + +for abbr in [ + # Literary phrases abbreviations + {ORTH: "и т.д.", NORM: "и так далее"}, + {ORTH: "и т.п.", NORM: "и тому подобное"}, + {ORTH: "т.д.", NORM: "так далее"}, + {ORTH: "т.п.", NORM: "тому подобное"}, + {ORTH: "т.е.", NORM: "то есть"}, + {ORTH: "т.к.", NORM: "так как"}, + {ORTH: "в т.ч.", NORM: "в том числе"}, + {ORTH: "и пр.", NORM: "и прочие"}, + {ORTH: "и др.", NORM: "и другие"}, + {ORTH: "т.н.", NORM: "так называемый"}, +]: + _exc[abbr[ORTH]] = [abbr] + + +for abbr in [ + # Appeal to a person abbreviations + {ORTH: "г-н", NORM: "господин"}, + {ORTH: "г-да", NORM: "господа"}, + {ORTH: "г-жа", NORM: "госпожа"}, + {ORTH: "тов.", NORM: "товарищ"}, +]: + _exc[abbr[ORTH]] = [abbr] + + +for abbr in [ + # Time periods abbreviations + {ORTH: "до н.э.", NORM: "до нашей эры"}, + {ORTH: "по н.в.", NORM: "по настоящее время"}, + {ORTH: "в н.в.", NORM: "в настоящее время"}, + {ORTH: "наст.", NORM: "настоящий"}, + {ORTH: "наст. время", NORM: "настоящее время"}, + {ORTH: "г.г.", NORM: "годы"}, + {ORTH: "гг.", NORM: "годы"}, + {ORTH: "т.г.", NORM: "текущий год"}, +]: + _exc[abbr[ORTH]] = [abbr] + + +for abbr in [ + # Address forming elements abbreviations + {ORTH: "респ.", NORM: "республика"}, + {ORTH: "обл.", NORM: "область"}, + {ORTH: "г.ф.з.", NORM: "город федерального значения"}, + {ORTH: "а.обл.", NORM: "автономная область"}, + {ORTH: "а.окр.", NORM: "автономный округ"}, + {ORTH: "м.р-н", NORM: "муниципальный район"}, + {ORTH: "г.о.", NORM: "городской округ"}, + {ORTH: "г.п.", NORM: "городское поселение"}, + {ORTH: "с.п.", NORM: "сельское поселение"}, + {ORTH: "вн.р-н", NORM: "внутригородской район"}, + {ORTH: "вн.тер.г.", NORM: "внутригородская территория города"}, + {ORTH: "пос.", NORM: "поселение"}, + {ORTH: "р-н", NORM: "район"}, + {ORTH: "с/с", NORM: "сельсовет"}, + {ORTH: "г.", NORM: "город"}, + {ORTH: "п.г.т.", NORM: "поселок городского типа"}, + {ORTH: "пгт.", NORM: "поселок городского типа"}, + {ORTH: "р.п.", NORM: "рабочий поселок"}, + {ORTH: "рп.", NORM: "рабочий поселок"}, + {ORTH: "кп.", NORM: "курортный поселок"}, + {ORTH: "гп.", NORM: "городской поселок"}, + {ORTH: "п.", NORM: "поселок"}, + {ORTH: "в-ки", NORM: "выселки"}, + {ORTH: "г-к", NORM: "городок"}, + {ORTH: "з-ка", NORM: "заимка"}, + {ORTH: "п-к", NORM: "починок"}, + {ORTH: "киш.", NORM: "кишлак"}, + {ORTH: "п. ст. ", NORM: "поселок станция"}, + {ORTH: "п. ж/д ст. ", NORM: "поселок при железнодорожной станции"}, + {ORTH: "ж/д бл-ст", NORM: "железнодорожный блокпост"}, + {ORTH: "ж/д б-ка", NORM: "железнодорожная будка"}, + {ORTH: "ж/д в-ка", NORM: "железнодорожная ветка"}, + {ORTH: "ж/д к-ма", NORM: "железнодорожная казарма"}, + {ORTH: "ж/д к-т", NORM: "железнодорожный комбинат"}, + {ORTH: "ж/д пл-ма", NORM: "железнодорожная платформа"}, + {ORTH: "ж/д пл-ка", NORM: "железнодорожная площадка"}, + {ORTH: "ж/д п.п.", NORM: "железнодорожный путевой пост"}, + {ORTH: "ж/д о.п.", NORM: "железнодорожный остановочный пункт"}, + {ORTH: "ж/д рзд.", NORM: "железнодорожный разъезд"}, + {ORTH: "ж/д ст. ", NORM: "железнодорожная станция"}, + {ORTH: "м-ко", NORM: "местечко"}, + {ORTH: "д.", NORM: "деревня"}, + {ORTH: "с.", NORM: "село"}, + {ORTH: "сл.", NORM: "слобода"}, + {ORTH: "ст. 
", NORM: "станция"}, + {ORTH: "ст-ца", NORM: "станица"}, + {ORTH: "у.", NORM: "улус"}, + {ORTH: "х.", NORM: "хутор"}, + {ORTH: "рзд.", NORM: "разъезд"}, + {ORTH: "зим.", NORM: "зимовье"}, + {ORTH: "б-г", NORM: "берег"}, + {ORTH: "ж/р", NORM: "жилой район"}, + {ORTH: "кв-л", NORM: "квартал"}, + {ORTH: "мкр.", NORM: "микрорайон"}, + {ORTH: "ост-в", NORM: "остров"}, + {ORTH: "платф.", NORM: "платформа"}, + {ORTH: "п/р", NORM: "промышленный район"}, + {ORTH: "р-н", NORM: "район"}, + {ORTH: "тер.", NORM: "территория"}, + { + ORTH: "тер. СНО", + NORM: "территория садоводческих некоммерческих объединений граждан", + }, + { + ORTH: "тер. ОНО", + NORM: "территория огороднических некоммерческих объединений граждан", + }, + {ORTH: "тер. ДНО", NORM: "территория дачных некоммерческих объединений граждан"}, + {ORTH: "тер. СНТ", NORM: "территория садоводческих некоммерческих товариществ"}, + {ORTH: "тер. ОНТ", NORM: "территория огороднических некоммерческих товариществ"}, + {ORTH: "тер. ДНТ", NORM: "территория дачных некоммерческих товариществ"}, + {ORTH: "тер. СПК", NORM: "территория садоводческих потребительских кооперативов"}, + {ORTH: "тер. ОПК", NORM: "территория огороднических потребительских кооперативов"}, + {ORTH: "тер. ДПК", NORM: "территория дачных потребительских кооперативов"}, + {ORTH: "тер. СНП", NORM: "территория садоводческих некоммерческих партнерств"}, + {ORTH: "тер. ОНП", NORM: "территория огороднических некоммерческих партнерств"}, + {ORTH: "тер. ДНП", NORM: "территория дачных некоммерческих партнерств"}, + {ORTH: "тер. ТСН", NORM: "территория товарищества собственников недвижимости"}, + {ORTH: "тер. ГСК", NORM: "территория гаражно-строительного кооператива"}, + {ORTH: "ус.", NORM: "усадьба"}, + {ORTH: "тер.ф.х.", NORM: "территория фермерского хозяйства"}, + {ORTH: "ю.", NORM: "юрты"}, + {ORTH: "ал.", NORM: "аллея"}, + {ORTH: "б-р", NORM: "бульвар"}, + {ORTH: "взв.", NORM: "взвоз"}, + {ORTH: "взд.", NORM: "въезд"}, + {ORTH: "дор.", NORM: "дорога"}, + {ORTH: "ззд.", NORM: "заезд"}, + {ORTH: "км", NORM: "километр"}, + {ORTH: "к-цо", NORM: "кольцо"}, + {ORTH: "лн.", NORM: "линия"}, + {ORTH: "мгстр.", NORM: "магистраль"}, + {ORTH: "наб.", NORM: "набережная"}, + {ORTH: "пер-д", NORM: "переезд"}, + {ORTH: "пер.", NORM: "переулок"}, + {ORTH: "пл-ка", NORM: "площадка"}, + {ORTH: "пл.", NORM: "площадь"}, + {ORTH: "пр-д", NORM: "проезд"}, + {ORTH: "пр-к", NORM: "просек"}, + {ORTH: "пр-ка", NORM: "просека"}, + {ORTH: "пр-лок", NORM: "проселок"}, + {ORTH: "пр-кт", NORM: "проспект"}, + {ORTH: "проул.", NORM: "проулок"}, + {ORTH: "рзд.", NORM: "разъезд"}, + {ORTH: "ряд", NORM: "ряд(ы)"}, + {ORTH: "с-р", NORM: "сквер"}, + {ORTH: "с-к", NORM: "спуск"}, + {ORTH: "сзд.", NORM: "съезд"}, + {ORTH: "туп.", NORM: "тупик"}, + {ORTH: "ул.", NORM: "улица"}, + {ORTH: "ш.", NORM: "шоссе"}, + {ORTH: "влд.", NORM: "владение"}, + {ORTH: "г-ж", NORM: "гараж"}, + {ORTH: "д.", NORM: "дом"}, + {ORTH: "двлд.", NORM: "домовладение"}, + {ORTH: "зд.", NORM: "здание"}, + {ORTH: "з/у", NORM: "земельный участок"}, + {ORTH: "кв.", NORM: "квартира"}, + {ORTH: "ком.", NORM: "комната"}, + {ORTH: "подв.", NORM: "подвал"}, + {ORTH: "кот.", NORM: "котельная"}, + {ORTH: "п-б", NORM: "погреб"}, + {ORTH: "к.", NORM: "корпус"}, + {ORTH: "ОНС", NORM: "объект незавершенного строительства"}, + {ORTH: "оф.", NORM: "офис"}, + {ORTH: "пав.", NORM: "павильон"}, + {ORTH: "помещ.", NORM: "помещение"}, + {ORTH: "раб.уч.", NORM: "рабочий участок"}, + {ORTH: "скл.", NORM: "склад"}, + {ORTH: "coop.", NORM: "сооружение"}, + {ORTH: "стр.", NORM: 
"строение"}, + {ORTH: "торг.зал", NORM: "торговый зал"}, + {ORTH: "а/п", NORM: "аэропорт"}, + {ORTH: "им.", NORM: "имени"}, +]: + _exc[abbr[ORTH]] = [abbr] + + +for abbr in [ + # Others abbreviations + {ORTH: "тыс.руб.", NORM: "тысяч рублей"}, + {ORTH: "тыс.", NORM: "тысяч"}, + {ORTH: "руб.", NORM: "рубль"}, + {ORTH: "долл.", NORM: "доллар"}, + {ORTH: "прим.", NORM: "примечание"}, + {ORTH: "прим.ред.", NORM: "примечание редакции"}, + {ORTH: "см. также", NORM: "смотри также"}, + {ORTH: "кв.м.", NORM: "квадрантный метр"}, + {ORTH: "м2", NORM: "квадрантный метр"}, + {ORTH: "б/у", NORM: "бывший в употреблении"}, + {ORTH: "сокр.", NORM: "сокращение"}, + {ORTH: "чел.", NORM: "человек"}, + {ORTH: "б.п.", NORM: "базисный пункт"}, +]: + _exc[abbr[ORTH]] = [abbr] TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc) From 23f06dc37f8b9e309028d4d8b3ef17f6daaad8e0 Mon Sep 17 00:00:00 2001 From: Grey Murav <65895033+gremur@users.noreply.github.com> Date: Thu, 17 Feb 2022 17:50:08 +0300 Subject: [PATCH 029/123] Extend list of numbers for ru language (#10280) * Extended list of numbers for ru language Extended list of numbers with all forms and cases including short forms, slang variants and roman numerals. * Update lex_attrs.py * Update 'like_num' function with percentages Added support for numbers with percentages like 12%, 1.2% and etc. to the 'like_num' function. * black formatting Co-authored-by: thomashacker --- spacy/lang/ru/lex_attrs.py | 153 ++++++++++++++++++++++++++----------- 1 file changed, 108 insertions(+), 45 deletions(-) diff --git a/spacy/lang/ru/lex_attrs.py b/spacy/lang/ru/lex_attrs.py index 7979c7ea6..90802cb9b 100644 --- a/spacy/lang/ru/lex_attrs.py +++ b/spacy/lang/ru/lex_attrs.py @@ -1,56 +1,119 @@ from ...attrs import LIKE_NUM -_num_words = [ - "ноль", - "один", - "два", - "три", - "четыре", - "пять", - "шесть", - "семь", - "восемь", - "девять", - "десять", - "одиннадцать", - "двенадцать", - "тринадцать", - "четырнадцать", - "пятнадцать", - "шестнадцать", - "семнадцать", - "восемнадцать", - "девятнадцать", - "двадцать", - "тридцать", - "сорок", - "пятьдесят", - "шестьдесят", - "семьдесят", - "восемьдесят", - "девяносто", - "сто", - "двести", - "триста", - "четыреста", - "пятьсот", - "шестьсот", - "семьсот", - "восемьсот", - "девятьсот", - "тысяча", - "миллион", - "миллиард", - "триллион", - "квадриллион", - "квинтиллион", -] +_num_words = list( + set( + """ +ноль ноля нолю нолём ноле нулевой нулевого нулевому нулевым нулевом нулевая нулевую нулевое нулевые нулевых нулевыми + +один первого первому единица одного одному первой первом первый первым одним одном во-первых + +два второго второму второй втором вторым двойка двумя двум двух во-вторых двое две двоих оба обе обеим обеими +обеих обоим обоими обоих + +полтора полторы полутора + +три третьего третьему третьем третьим третий тройка трешка трёшка трояк трёха треха тремя трем трех трое троих трёх + +четыре четвертого четвертому четвертом четвертый четвертым четверка четырьмя четырем четырех четверо четырёх четверым +четверых + +пять пятерочка пятерка пятого пятому пятом пятый пятым пятью пяти пятеро пятерых пятерыми + +шесть шестерка шестого шестому шестой шестом шестым шестью шести шестеро шестерых + +семь семерка седьмого седьмому седьмой седьмом седьмым семью семи семеро + +восемь восьмерка восьмого восьмому восемью восьмой восьмом восьмым восеми восьмером восьми восьмью + +девять девятого девятому девятка девятом девятый девятым девятью девяти девятером вдевятером девятерых + +десять десятого десятому десятка десятом 
десятый десятым десятью десяти десятером вдесятером + +одиннадцать одиннадцатого одиннадцатому одиннадцатом одиннадцатый одиннадцатым одиннадцатью одиннадцати + +двенадцать двенадцатого двенадцатому двенадцатом двенадцатый двенадцатым двенадцатью двенадцати + +тринадцать тринадцатого тринадцатому тринадцатом тринадцатый тринадцатым тринадцатью тринадцати + +четырнадцать четырнадцатого четырнадцатому четырнадцатом четырнадцатый четырнадцатым четырнадцатью четырнадцати + +пятнадцать пятнадцатого пятнадцатому пятнадцатом пятнадцатый пятнадцатым пятнадцатью пятнадцати + +шестнадцать шестнадцатого шестнадцатому шестнадцатом шестнадцатый шестнадцатым шестнадцатью шестнадцати + +семнадцать семнадцатого семнадцатому семнадцатом семнадцатый семнадцатым семнадцатью семнадцати + +восемнадцать восемнадцатого восемнадцатому восемнадцатом восемнадцатый восемнадцатым восемнадцатью восемнадцати + +девятнадцать девятнадцатого девятнадцатому девятнадцатом девятнадцатый девятнадцатым девятнадцатью девятнадцати + +двадцать двадцатого двадцатому двадцатом двадцатый двадцатым двадцатью двадцати + +тридцать тридцатого тридцатому тридцатом тридцатый тридцатым тридцатью тридцати + +тридевять + +сорок сорокового сороковому сороковом сороковым сороковой + +пятьдесят пятьдесятого пятьдесятому пятьюдесятью пятьдесятом пятьдесятый пятьдесятым пятидесяти полтинник + +шестьдесят шестьдесятого шестьдесятому шестьюдесятью шестьдесятом шестьдесятый шестьдесятым шестидесятые шестидесяти + +семьдесят семьдесятого семьдесятому семьюдесятью семьдесятом семьдесятый семьдесятым семидесяти + +восемьдесят восемьдесятого восемьдесятому восемьюдесятью восемьдесятом восемьдесятый восемьдесятым восемидесяти +восьмидесяти + +девяносто девяностого девяностому девяностом девяностый девяностым девяноста + +сто сотого сотому сотка сотня сотом сотен сотый сотым ста + +двести двумястами двухсотого двухсотому двухсотом двухсотый двухсотым двумстам двухстах двухсот + +триста тремястами трехсотого трехсотому трехсотом трехсотый трехсотым тремстам трехстах трехсот + +четыреста четырехсотого четырехсотому четырьмястами четырехсотом четырехсотый четырехсотым четыремстам четырехстах +четырехсот + +пятьсот пятисотого пятисотому пятьюстами пятисотом пятисотый пятисотым пятистам пятистах пятисот + +шестьсот шестисотого шестисотому шестьюстами шестисотом шестисотый шестисотым шестистам шестистах шестисот + +семьсот семисотого семисотому семьюстами семисотом семисотый семисотым семистам семистах семисот + +восемьсот восемисотого восемисотому восемисотом восемисотый восемисотым восьмистами восьмистам восьмистах восьмисот + +девятьсот девятисотого девятисотому девятьюстами девятисотом девятисотый девятисотым девятистам девятистах девятисот + +тысяча тысячного тысячному тысячном тысячный тысячным тысячам тысячах тысячей тысяч тысячи тыс + +миллион миллионного миллионов миллионному миллионном миллионный миллионным миллионом миллиона миллионе миллиону +миллионов лям млн + +миллиард миллиардного миллиардному миллиардном миллиардный миллиардным миллиардом миллиарда миллиарде миллиарду +миллиардов лярд млрд + +триллион триллионного триллионному триллионном триллионный триллионным триллионом триллиона триллионе триллиону +триллионов трлн + +квадриллион квадриллионного квадриллионному квадриллионный квадриллионным квадриллионом квадриллиона квадриллионе +квадриллиону квадриллионов квадрлн + +квинтиллион квинтиллионного квинтиллионному квинтиллионный квинтиллионным квинтиллионом квинтиллиона квинтиллионе +квинтиллиону квинтиллионов квинтлн + +i ii iii iv vi vii 
viii ix xi xii xiii xiv xv xvi xvii xviii xix xx xxi xxii xxiii xxiv xxv xxvi xxvii xxvii xxix +""".split() + ) +) def like_num(text): if text.startswith(("+", "-", "±", "~")): text = text[1:] + if text.endswith("%"): + text = text[:-1] text = text.replace(",", "").replace(".", "") if text.isdigit(): return True From aa93b471a1cadb661c063dee4913ad8f2e492d48 Mon Sep 17 00:00:00 2001 From: Grey Murav <65895033+gremur@users.noreply.github.com> Date: Thu, 17 Feb 2022 17:51:15 +0300 Subject: [PATCH 030/123] Extend list of stopwords for ru language (#10313) --- spacy/lang/ru/stop_words.py | 105 ++++++++++++++++++++++++++++-------- 1 file changed, 82 insertions(+), 23 deletions(-) diff --git a/spacy/lang/ru/stop_words.py b/spacy/lang/ru/stop_words.py index 16cb55ef9..d6ea6b42a 100644 --- a/spacy/lang/ru/stop_words.py +++ b/spacy/lang/ru/stop_words.py @@ -1,52 +1,111 @@ STOP_WORDS = set( """ -а +а авось ага агу аж ай али алло ау ах ая -будем будет будете будешь буду будут будучи будь будьте бы был была были было -быть +б будем будет будете будешь буду будут будучи будь будьте бы был была были было +быть бац без безусловно бишь благо благодаря ближайшие близко более больше +будто бывает бывала бывали бываю бывают бытует в вам вами вас весь во вот все всё всего всей всем всём всеми всему всех всею -всея всю вся вы +всея всю вся вы ваш ваша ваше ваши вдали вдобавок вдруг ведь везде вернее +взаимно взаправду видно вишь включая вместо внакладе вначале вне вниз внизу +вновь вовсе возможно воистину вокруг вон вообще вопреки вперекор вплоть +вполне вправду вправе впрочем впрямь вресноту вроде вряд всегда всюду +всякий всякого всякой всячески вчеред -да для до +г го где гораздо гав -его едим едят ее её ей ел ела ем ему емъ если ест есть ешь еще ещё ею +д да для до дабы давайте давно давным даже далее далеко дальше данная +данного данное данной данном данному данные данный данных дану данунах +даром де действительно довольно доколе доколь долго должен должна +должно должны должный дополнительно другая другие другим другими +других другое другой -же +е его едим едят ее её ей ел ела ем ему емъ если ест есть ешь еще ещё ею едва +ежели еле -за +ж же -и из или им ими имъ их +з за затем зато зачем здесь значит зря + +и из или им ими имъ их ибо иль имеет имел имела имело именно иметь иначе +иногда иным иными итак ишь + +й к как кем ко когда кого ком кому комья которая которого которое которой котором -которому которою которую которые который которым которыми которых кто +которому которою которую которые который которым которыми которых кто ка кабы +каждая каждое каждые каждый кажется казалась казались казалось казался казаться +какая какие каким какими каков какого какой какому какою касательно кой коли +коль конечно короче кроме кстати ку куда -меня мне мной мною мог моги могите могла могли могло могу могут мое моё моего +л ли либо лишь любая любого любое любой любом любую любыми любых + +м меня мне мной мною мог моги могите могла могли могло могу могут мое моё моего моей моем моём моему моею можем может можете можешь мои мой моим моими моих -мочь мою моя мы +мочь мою моя мы мало меж между менее меньше мимо многие много многого многое +многом многому можно мол му -на нам нами нас наса наш наша наше нашего нашей нашем нашему нашею наши нашим +н на нам нами нас наса наш наша наше нашего нашей нашем нашему нашею наши нашим нашими наших нашу не него нее неё ней нем нём нему нет нею ним ними них но +наверняка наверху навряд навыворот над надо назад наиболее наизворот +наизнанку наипаче накануне наконец наоборот 
наперед наперекор наподобие +например напротив напрямую насилу настоящая настоящее настоящие настоящий +насчет нате находиться начала начале неважно негде недавно недалеко незачем +некем некогда некому некоторая некоторые некоторый некоторых некто некуда +нельзя немногие немногим немного необходимо необходимости необходимые +необходимым неоткуда непрерывно нередко несколько нету неужели нечего +нечем нечему нечто нешто нибудь нигде ниже низко никак никакой никем +никогда никого никому никто никуда ниоткуда нипочем ничего ничем ничему +ничто ну нужная нужно нужного нужные нужный нужных ныне нынешнее нынешней +нынешних нынче о об один одна одни одним одними одних одно одного одной одном одному одною -одну он она оне они оно от +одну он она оне они оно от оба общую обычно ого однажды однако ой около оный +оп опять особенно особо особую особые откуда отнелижа отнелиже отовсюду +отсюда оттого оттот оттуда отчего отчему ох очевидно очень ом -по при +п по при паче перед под подавно поди подобная подобно подобного подобные +подобный подобным подобных поелику пожалуй пожалуйста позже поистине +пока покамест поколе поколь покуда покудова помимо понеже поприще пор +пора посему поскольку после посреди посредством потом потому потомушта +похожем почему почти поэтому прежде притом причем про просто прочего +прочее прочему прочими проще прям пусть + +р ради разве ранее рано раньше рядом с сам сама сами самим самими самих само самого самом самому саму свое своё своего своей своем своём своему своею свои свой своим своими своих свою своя -себе себя собой собою +себе себя собой собою самая самое самой самый самых сверх свыше се сего сей +сейчас сие сих сквозь сколько скорее скоро следует слишком смогут сможет +сначала снова со собственно совсем сперва спокону спустя сразу среди сродни +стал стала стали стало стать суть сызнова -та так такая такие таким такими таких такого такое такой таком такому такою -такую те тебе тебя тем теми тех то тобой тобою того той только том томах тому -тот тою ту ты +та то ту ты ти так такая такие таким такими таких такого такое такой таком такому такою +такую те тебе тебя тем теми тех тобой тобою того той только том томах тому +тот тою также таки таков такова там твои твоим твоих твой твоя твоё +теперь тогда тоже тотчас точно туда тут тьфу тая -у уже +у уже увы уж ура ух ую -чего чем чём чему что чтобы +ф фу -эта эти этим этими этих это этого этой этом этому этот этою эту +х ха хе хорошо хотел хотела хотелось хотеть хоть хотя хочешь хочу хуже -я +ч чего чем чём чему что чтобы часто чаще чей через чтоб чуть чхать чьим +чьих чьё чё + +ш ша + +щ ща щас + +ы ых ые ый + +э эта эти этим этими этих это этого этой этом этому этот этою эту эдак эдакий +эй эка экий этак этакий эх + +ю + +я явно явных яко якобы якоже """.split() ) From 28ba31e793cf0a59e5ce14bd2e8f5c5d6e785ca2 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 17 Feb 2022 15:54:09 +0100 Subject: [PATCH 031/123] Add whitespace and combined augmenters (#10170) Add whitespace augmenter that inserts a single whitespace token into a doc containing annotation used in core trained pipelines. Add a combined augmenter that handles lowercasing, orth variants and whitespace augmentation. 
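For reference, a minimal usage sketch of the new combined augmenter — the
registered name and argument names come from this patch, while the numeric
values, the whitespace strings and the corpus path below are placeholder
assumptions, not defaults:

    from spacy import registry
    from spacy.training import Corpus

    # Look up the factory registered by this patch and build the augmenter.
    create_augmenter = registry.augmenters.get("spacy.combined_augmenter.v1")
    augmenter = create_augmenter(
        lower_level=0.1,            # lowercase ~10% of examples
        orth_level=0.0,             # skip orth-variant replacement in this sketch
        orth_variants=None,
        whitespace_level=0.1,       # insert whitespace tokens into ~10% of examples
        whitespace_per_token=0.05,  # roughly one inserted token per 20 reference tokens
        whitespace_variants=[" ", "\t", "\n"],
    )
    # Attach the augmenter to a corpus reader so it is applied while training.
    corpus = Corpus("corpus/train.spacy", augmenter=augmenter)

In a training config the same arguments would typically be supplied under
[corpora.train.augmenter] with @augmenters = "spacy.combined_augmenter.v1".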
--- spacy/tests/training/test_augmenters.py | 87 ++++++++- spacy/training/augment.py | 224 +++++++++++++++++++++--- 2 files changed, 288 insertions(+), 23 deletions(-) diff --git a/spacy/tests/training/test_augmenters.py b/spacy/tests/training/test_augmenters.py index 43a78e4b0..e3639c5da 100644 --- a/spacy/tests/training/test_augmenters.py +++ b/spacy/tests/training/test_augmenters.py @@ -1,9 +1,11 @@ import pytest -from spacy.training import Corpus +from spacy.pipeline._parser_internals.nonproj import contains_cycle +from spacy.training import Corpus, Example from spacy.training.augment import create_orth_variants_augmenter from spacy.training.augment import create_lower_casing_augmenter +from spacy.training.augment import make_whitespace_variant from spacy.lang.en import English -from spacy.tokens import DocBin, Doc +from spacy.tokens import DocBin, Doc, Span from contextlib import contextmanager import random @@ -153,3 +155,84 @@ def test_custom_data_augmentation(nlp, doc): ents = [(e.start, e.end, e.label) for e in doc.ents] assert [(e.start, e.end, e.label) for e in corpus[0].reference.ents] == ents assert [(e.start, e.end, e.label) for e in corpus[1].reference.ents] == ents + + +def test_make_whitespace_variant(nlp): + # fmt: off + text = "They flew to New York City.\nThen they drove to Washington, D.C." + words = ["They", "flew", "to", "New", "York", "City", ".", "\n", "Then", "they", "drove", "to", "Washington", ",", "D.C."] + spaces = [True, True, True, True, True, False, False, False, True, True, True, True, False, True, False] + tags = ["PRP", "VBD", "IN", "NNP", "NNP", "NNP", ".", "_SP", "RB", "PRP", "VBD", "IN", "NNP", ",", "NNP"] + lemmas = ["they", "fly", "to", "New", "York", "City", ".", "\n", "then", "they", "drive", "to", "Washington", ",", "D.C."] + heads = [1, 1, 1, 4, 5, 2, 1, 10, 10, 10, 10, 10, 11, 12, 12] + deps = ["nsubj", "ROOT", "prep", "compound", "compound", "pobj", "punct", "dep", "advmod", "nsubj", "ROOT", "prep", "pobj", "punct", "appos"] + ents = ["O", "O", "O", "B-GPE", "I-GPE", "I-GPE", "O", "O", "O", "O", "O", "O", "B-GPE", "O", "B-GPE"] + # fmt: on + doc = Doc( + nlp.vocab, + words=words, + spaces=spaces, + tags=tags, + lemmas=lemmas, + heads=heads, + deps=deps, + ents=ents, + ) + assert doc.text == text + example = Example(nlp.make_doc(text), doc) + # whitespace is only added internally in entity spans + mod_ex = make_whitespace_variant(nlp, example, " ", 3) + assert mod_ex.reference.ents[0].text == "New York City" + mod_ex = make_whitespace_variant(nlp, example, " ", 4) + assert mod_ex.reference.ents[0].text == "New York City" + mod_ex = make_whitespace_variant(nlp, example, " ", 5) + assert mod_ex.reference.ents[0].text == "New York City" + mod_ex = make_whitespace_variant(nlp, example, " ", 6) + assert mod_ex.reference.ents[0].text == "New York City" + # add a space at every possible position + for i in range(len(doc) + 1): + mod_ex = make_whitespace_variant(nlp, example, " ", i) + assert mod_ex.reference[i].is_space + # adds annotation when the doc contains at least partial annotation + assert [t.tag_ for t in mod_ex.reference] == tags[:i] + ["_SP"] + tags[i:] + assert [t.lemma_ for t in mod_ex.reference] == lemmas[:i] + [" "] + lemmas[i:] + assert [t.dep_ for t in mod_ex.reference] == deps[:i] + ["dep"] + deps[i:] + # does not add partial annotation if doc does not contain this feature + assert not mod_ex.reference.has_annotation("POS") + assert not mod_ex.reference.has_annotation("MORPH") + # produces well-formed trees + assert not 
contains_cycle([t.head.i for t in mod_ex.reference]) + assert len(list(doc.sents)) == 2 + if i == 0: + assert mod_ex.reference[i].head.i == 1 + else: + assert mod_ex.reference[i].head.i == i - 1 + # adding another space also produces well-formed trees + for j in (3, 8, 10): + mod_ex2 = make_whitespace_variant(nlp, mod_ex, "\t\t\n", j) + assert not contains_cycle([t.head.i for t in mod_ex2.reference]) + assert len(list(doc.sents)) == 2 + assert mod_ex2.reference[j].head.i == j - 1 + # entities are well-formed + assert len(doc.ents) == len(mod_ex.reference.ents) + for ent in mod_ex.reference.ents: + assert not ent[0].is_space + assert not ent[-1].is_space + + # no modifications if: + # partial dependencies + example.reference[0].dep_ = "" + mod_ex = make_whitespace_variant(nlp, example, " ", 5) + assert mod_ex.text == example.reference.text + example.reference[0].dep_ = "nsubj" # reset + + # spans + example.reference.spans["spans"] = [example.reference[0:5]] + mod_ex = make_whitespace_variant(nlp, example, " ", 5) + assert mod_ex.text == example.reference.text + del example.reference.spans["spans"] # reset + + # links + example.reference.ents = [Span(doc, 0, 2, label="ENT", kb_id="Q123")] + mod_ex = make_whitespace_variant(nlp, example, " ", 5) + assert mod_ex.text == example.reference.text diff --git a/spacy/training/augment.py b/spacy/training/augment.py index 63b54034c..59a39c7ee 100644 --- a/spacy/training/augment.py +++ b/spacy/training/augment.py @@ -1,4 +1,5 @@ from typing import Callable, Iterator, Dict, List, Tuple, TYPE_CHECKING +from typing import Optional import random import itertools from functools import partial @@ -11,32 +12,87 @@ if TYPE_CHECKING: from ..language import Language # noqa: F401 -class OrthVariantsSingle(BaseModel): - tags: List[StrictStr] - variants: List[StrictStr] +@registry.augmenters("spacy.combined_augmenter.v1") +def create_combined_augmenter( + lower_level: float, + orth_level: float, + orth_variants: Optional[Dict[str, List[Dict]]], + whitespace_level: float, + whitespace_per_token: float, + whitespace_variants: Optional[List[str]], +) -> Callable[["Language", Example], Iterator[Example]]: + """Create a data augmentation callback that uses orth-variant replacement. + The callback can be added to a corpus or other data iterator during training. + + lower_level (float): The percentage of texts that will be lowercased. + orth_level (float): The percentage of texts that will be augmented. + orth_variants (Optional[Dict[str, List[Dict]]]): A dictionary containing the + single and paired orth variants. Typically loaded from a JSON file. + whitespace_level (float): The percentage of texts that will have whitespace + tokens inserted. + whitespace_per_token (float): The number of whitespace tokens to insert in + the modified doc as a percentage of the doc length. + whitespace_variants (Optional[List[str]]): The whitespace token texts. + RETURNS (Callable[[Language, Example], Iterator[Example]]): The augmenter. 
+ """ + return partial( + combined_augmenter, + lower_level=lower_level, + orth_level=orth_level, + orth_variants=orth_variants, + whitespace_level=whitespace_level, + whitespace_per_token=whitespace_per_token, + whitespace_variants=whitespace_variants, + ) -class OrthVariantsPaired(BaseModel): - tags: List[StrictStr] - variants: List[List[StrictStr]] - - -class OrthVariants(BaseModel): - paired: List[OrthVariantsPaired] = [] - single: List[OrthVariantsSingle] = [] +def combined_augmenter( + nlp: "Language", + example: Example, + *, + lower_level: float = 0.0, + orth_level: float = 0.0, + orth_variants: Optional[Dict[str, List[Dict]]] = None, + whitespace_level: float = 0.0, + whitespace_per_token: float = 0.0, + whitespace_variants: Optional[List[str]] = None, +) -> Iterator[Example]: + if random.random() < lower_level: + example = make_lowercase_variant(nlp, example) + if orth_variants and random.random() < orth_level: + raw_text = example.text + orig_dict = example.to_dict() + variant_text, variant_token_annot = make_orth_variants( + nlp, + raw_text, + orig_dict["token_annotation"], + orth_variants, + lower=False, + ) + orig_dict["token_annotation"] = variant_token_annot + example = example.from_dict(nlp.make_doc(variant_text), orig_dict) + if whitespace_variants and random.random() < whitespace_level: + for _ in range(int(len(example.reference) * whitespace_per_token)): + example = make_whitespace_variant( + nlp, + example, + random.choice(whitespace_variants), + random.randrange(0, len(example.reference)), + ) + yield example @registry.augmenters("spacy.orth_variants.v1") def create_orth_variants_augmenter( - level: float, lower: float, orth_variants: OrthVariants + level: float, lower: float, orth_variants: Dict[str, List[Dict]] ) -> Callable[["Language", Example], Iterator[Example]]: """Create a data augmentation callback that uses orth-variant replacement. The callback can be added to a corpus or other data iterator during training. level (float): The percentage of texts that will be augmented. lower (float): The percentage of texts that will be lowercased. - orth_variants (Dict[str, dict]): A dictionary containing the single and - paired orth variants. Typically loaded from a JSON file. + orth_variants (Dict[str, List[Dict]]): A dictionary containing + the single and paired orth variants. Typically loaded from a JSON file. RETURNS (Callable[[Language, Example], Iterator[Example]]): The augmenter. 
""" return partial( @@ -67,16 +123,20 @@ def lower_casing_augmenter( if random.random() >= level: yield example else: - example_dict = example.to_dict() - doc = nlp.make_doc(example.text.lower()) - example_dict["token_annotation"]["ORTH"] = [t.lower_ for t in example.reference] - yield example.from_dict(doc, example_dict) + yield make_lowercase_variant(nlp, example) + + +def make_lowercase_variant(nlp: "Language", example: Example): + example_dict = example.to_dict() + doc = nlp.make_doc(example.text.lower()) + example_dict["token_annotation"]["ORTH"] = [t.lower_ for t in example.reference] + return example.from_dict(doc, example_dict) def orth_variants_augmenter( nlp: "Language", example: Example, - orth_variants: Dict, + orth_variants: Dict[str, List[Dict]], *, level: float = 0.0, lower: float = 0.0, @@ -148,10 +208,132 @@ def make_orth_variants( pair_idx = pair.index(words[word_idx]) words[word_idx] = punct_choices[punct_idx][pair_idx] token_dict["ORTH"] = words - # construct modified raw text from words and spaces + raw = construct_modified_raw_text(token_dict) + return raw, token_dict + + +def make_whitespace_variant( + nlp: "Language", + example: Example, + whitespace: str, + position: int, +) -> Example: + """Insert the whitespace token at the specified token offset in the doc. + This is primarily intended for v2-compatible training data that doesn't + include links or spans. If the document includes links, spans, or partial + dependency annotation, it is returned without modifications. + + The augmentation follows the basics of the v2 space attachment policy, but + without a distinction between "real" and other tokens, so space tokens + may be attached to space tokens: + - at the beginning of a sentence attach the space token to the following + token + - otherwise attach the space token to the preceding token + + The augmenter does not attempt to consolidate adjacent whitespace in the + same way that the tokenizer would. + + The following annotation is used for the space token: + TAG: "_SP" + MORPH: "" + POS: "SPACE" + LEMMA: ORTH + DEP: "dep" + SENT_START: False + + The annotation for each attribute is only set for the space token if there + is already at least partial annotation for that attribute in the original + example. + + RETURNS (Example): Example with one additional space token. 
+ """ + example_dict = example.to_dict() + doc_dict = example_dict.get("doc_annotation", {}) + token_dict = example_dict.get("token_annotation", {}) + # returned unmodified if: + # - doc is empty + # - words are not defined + # - links are defined (only character-based offsets, which is more a quirk + # of Example.to_dict than a technical constraint) + # - spans are defined + # - there are partial dependencies + if ( + len(example.reference) == 0 + or "ORTH" not in token_dict + or len(doc_dict.get("links", [])) > 0 + or len(example.reference.spans) > 0 + or ( + example.reference.has_annotation("DEP") + and not example.reference.has_annotation("DEP", require_complete=True) + ) + ): + return example + words = token_dict.get("ORTH", []) + length = len(words) + assert 0 <= position <= length + if example.reference.has_annotation("ENT_TYPE"): + # I-ENTITY if between B/I-ENTITY and I/L-ENTITY otherwise O + entity = "O" + if position > 1 and position < length: + ent_prev = doc_dict["entities"][position - 1] + ent_next = doc_dict["entities"][position] + if "-" in ent_prev and "-" in ent_next: + ent_iob_prev = ent_prev.split("-")[0] + ent_type_prev = ent_prev.split("-", 1)[1] + ent_iob_next = ent_next.split("-")[0] + ent_type_next = ent_next.split("-", 1)[1] + if ( + ent_iob_prev in ("B", "I") + and ent_iob_next in ("I", "L") + and ent_type_prev == ent_type_next + ): + entity = f"I-{ent_type_prev}" + doc_dict["entities"].insert(position, entity) + else: + del doc_dict["entities"] + token_dict["ORTH"].insert(position, whitespace) + token_dict["SPACY"].insert(position, False) + if example.reference.has_annotation("TAG"): + token_dict["TAG"].insert(position, "_SP") + else: + del token_dict["TAG"] + if example.reference.has_annotation("LEMMA"): + token_dict["LEMMA"].insert(position, whitespace) + else: + del token_dict["LEMMA"] + if example.reference.has_annotation("POS"): + token_dict["POS"].insert(position, "SPACE") + else: + del token_dict["POS"] + if example.reference.has_annotation("MORPH"): + token_dict["MORPH"].insert(position, "") + else: + del token_dict["MORPH"] + if example.reference.has_annotation("DEP", require_complete=True): + if position == 0: + token_dict["HEAD"].insert(position, 0) + else: + token_dict["HEAD"].insert(position, position - 1) + for i in range(len(token_dict["HEAD"])): + if token_dict["HEAD"][i] >= position: + token_dict["HEAD"][i] += 1 + token_dict["DEP"].insert(position, "dep") + else: + del token_dict["HEAD"] + del token_dict["DEP"] + if example.reference.has_annotation("SENT_START"): + token_dict["SENT_START"].insert(position, False) + else: + del token_dict["SENT_START"] + raw = construct_modified_raw_text(token_dict) + return Example.from_dict(nlp.make_doc(raw), example_dict) + + +def construct_modified_raw_text(token_dict): + """Construct modified raw text from words and spaces.""" raw = "" for orth, spacy in zip(token_dict["ORTH"], token_dict["SPACY"]): raw += orth if spacy: raw += " " - return raw, token_dict + return raw From 6de84c8757f0779f3bc90edabc6789f6b24b05a5 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 21 Feb 2022 09:15:42 +0100 Subject: [PATCH 032/123] Auto-format code with black (#10333) Co-authored-by: explosion-bot --- spacy/lang/fr/syntax_iterators.py | 4 +--- spacy/lang/ko/__init__.py | 2 +- spacy/pipeline/textcat.py | 2 +- spacy/tests/lang/fi/test_noun_chunks.py | 19 +++++++++++++++++-- 4 files changed, 20 insertions(+), 7 deletions(-) diff --git 
a/spacy/lang/fr/syntax_iterators.py b/spacy/lang/fr/syntax_iterators.py index 5f7ba5c10..5849c40b3 100644 --- a/spacy/lang/fr/syntax_iterators.py +++ b/spacy/lang/fr/syntax_iterators.py @@ -64,9 +64,7 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]: prev_end = right_end.i left_index = word.left_edge.i - left_index = ( - left_index + 1 if word.left_edge.pos == adp_pos else left_index - ) + left_index = left_index + 1 if word.left_edge.pos == adp_pos else left_index yield left_index, right_end.i + 1, np_label elif word.dep == conj_label: diff --git a/spacy/lang/ko/__init__.py b/spacy/lang/ko/__init__.py index eb3c2e1f5..a03f7821a 100644 --- a/spacy/lang/ko/__init__.py +++ b/spacy/lang/ko/__init__.py @@ -99,7 +99,7 @@ def try_mecab_import() -> None: return MeCab except ImportError: raise ImportError( - "The Korean tokenizer (\"spacy.ko.KoreanTokenizer\") requires " + 'The Korean tokenizer ("spacy.ko.KoreanTokenizer") requires ' "[mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md), " "[mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic), " "and [natto-py](https://github.com/buruzaemon/natto-py)" diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py index 64a452a7a..690c350fa 100644 --- a/spacy/pipeline/textcat.py +++ b/spacy/pipeline/textcat.py @@ -320,7 +320,7 @@ class TextCategorizer(TrainablePipe): self._validate_categories(examples) truths, not_missing = self._examples_to_truth(examples) not_missing = self.model.ops.asarray(not_missing) # type: ignore - d_scores = (scores - truths) + d_scores = scores - truths d_scores *= not_missing mean_square_error = (d_scores**2).mean() return float(mean_square_error), d_scores diff --git a/spacy/tests/lang/fi/test_noun_chunks.py b/spacy/tests/lang/fi/test_noun_chunks.py index cc3b5aa36..cab84b311 100644 --- a/spacy/tests/lang/fi/test_noun_chunks.py +++ b/spacy/tests/lang/fi/test_noun_chunks.py @@ -107,7 +107,17 @@ FI_NP_TEST_EXAMPLES = [ ( "New York tunnetaan kaupunkina, joka ei koskaan nuku", ["PROPN", "PROPN", "VERB", "NOUN", "PUNCT", "PRON", "AUX", "ADV", "VERB"], - ["obj", "flat:name", "ROOT", "obl", "punct", "nsubj", "aux", "advmod", "acl:relcl"], + [ + "obj", + "flat:name", + "ROOT", + "obl", + "punct", + "nsubj", + "aux", + "advmod", + "acl:relcl", + ], [2, -1, 0, -1, 4, 3, 2, 1, -5], ["New York", "kaupunkina"], ), @@ -130,7 +140,12 @@ FI_NP_TEST_EXAMPLES = [ ["NOUN", "VERB", "NOUN", "NOUN", "ADJ", "NOUN"], ["nsubj", "ROOT", "obj", "obl", "amod", "obl"], [1, 0, -1, -1, 1, -3], - ["sairaanhoitopiirit", "leikkaustoimintaa", "alueellaan", "useammassa sairaalassa"], + [ + "sairaanhoitopiirit", + "leikkaustoimintaa", + "alueellaan", + "useammassa sairaalassa", + ], ), ( "Lain mukaan varhaiskasvatus on suunnitelmallista toimintaa", From f4c74764b84c5b5e7628392875b8d2def8bb07d5 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 21 Feb 2022 10:22:36 +0100 Subject: [PATCH 033/123] Fix Tok2Vec for empty batches (#10324) * Add test for tok2vec with vectors and empty docs * Add shortcut for empty batch in Tok2Vec.predict * Avoid types --- spacy/pipeline/tok2vec.py | 4 ++++ spacy/tests/pipeline/test_tok2vec.py | 23 +++++++++++++++++++++-- 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py index cb601e5dc..2e3dde3cb 100644 --- a/spacy/pipeline/tok2vec.py +++ b/spacy/pipeline/tok2vec.py @@ -118,6 +118,10 @@ class Tok2Vec(TrainablePipe): DOCS: https://spacy.io/api/tok2vec#predict """ + if not any(len(doc) for doc in 
docs): + # Handle cases where there are no tokens in any docs. + width = self.model.get_dim("nO") + return [self.model.ops.alloc((0, width)) for doc in docs] tokvecs = self.model.predict(docs) batch_id = Tok2VecListener.get_batch_id(docs) for listener in self.listeners: diff --git a/spacy/tests/pipeline/test_tok2vec.py b/spacy/tests/pipeline/test_tok2vec.py index eeea906bb..a5ac85e1e 100644 --- a/spacy/tests/pipeline/test_tok2vec.py +++ b/spacy/tests/pipeline/test_tok2vec.py @@ -11,7 +11,7 @@ from spacy.lang.en import English from thinc.api import Config, get_current_ops from numpy.testing import assert_array_equal -from ..util import get_batch, make_tempdir +from ..util import get_batch, make_tempdir, add_vecs_to_vocab def test_empty_doc(): @@ -140,9 +140,25 @@ TRAIN_DATA = [ ] -def test_tok2vec_listener(): +@pytest.mark.parametrize("with_vectors", (False, True)) +def test_tok2vec_listener(with_vectors): orig_config = Config().from_str(cfg_string) + orig_config["components"]["tok2vec"]["model"]["embed"][ + "include_static_vectors" + ] = with_vectors nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True) + + if with_vectors: + ops = get_current_ops() + vectors = [ + ("apple", ops.asarray([1, 2, 3])), + ("orange", ops.asarray([-1, -2, -3])), + ("and", ops.asarray([-1, -1, -1])), + ("juice", ops.asarray([5, 5, 10])), + ("pie", ops.asarray([7, 6.3, 8.9])), + ] + add_vecs_to_vocab(nlp.vocab, vectors) + assert nlp.pipe_names == ["tok2vec", "tagger"] tagger = nlp.get_pipe("tagger") tok2vec = nlp.get_pipe("tok2vec") @@ -169,6 +185,9 @@ def test_tok2vec_listener(): ops = get_current_ops() assert_array_equal(ops.to_numpy(doc.tensor), ops.to_numpy(doc_tensor)) + # test with empty doc + doc = nlp("") + # TODO: should this warn or error? 
nlp.select_pipes(disable="tok2vec") assert nlp.pipe_names == ["tagger"] From 3358fb9bdd96792725461e346eb0c1a986322e15 Mon Sep 17 00:00:00 2001 From: Peter Baumgartner <5107405+pmbaumgartner@users.noreply.github.com> Date: Mon, 21 Feb 2022 04:24:15 -0500 Subject: [PATCH 034/123] Miscellaneous Minor SpanGroups/DocBin Improvements (#10250) * MultiHashEmbed vector docs correction * doc copy span test * ignore empty lists in DocBin.span_groups * serialized empty list const + SpanGroups.is_empty * add conditional deserial on from_bytes * clean up + reorganize * rm test * add constant as class attribute * rename to _EMPTY_BYTES * Update spacy/tests/doc/test_span.py Co-authored-by: Sofie Van Landeghem Co-authored-by: Sofie Van Landeghem --- spacy/tests/doc/test_span.py | 13 +++++++++++++ spacy/tokens/_dict_proxies.py | 7 ++++++- spacy/tokens/_serialize.py | 3 ++- 3 files changed, 21 insertions(+), 2 deletions(-) diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py index bdf34c1c1..c0496cabf 100644 --- a/spacy/tests/doc/test_span.py +++ b/spacy/tests/doc/test_span.py @@ -655,3 +655,16 @@ def test_span_sents(doc, start, end, expected_sentences, expected_sentences_with def test_span_sents_not_parsed(doc_not_parsed): with pytest.raises(ValueError): list(Span(doc_not_parsed, 0, 3).sents) + + +def test_span_group_copy(doc): + doc.spans["test"] = [doc[0:1], doc[2:4]] + assert len(doc.spans["test"]) == 2 + doc_copy = doc.copy() + # check that the spans were indeed copied + assert len(doc_copy.spans["test"]) == 2 + # add a new span to the original doc + doc.spans["test"].append(doc[3:4]) + assert len(doc.spans["test"]) == 3 + # check that the copy spans were not modified and this is an isolated doc + assert len(doc_copy.spans["test"]) == 2 diff --git a/spacy/tokens/_dict_proxies.py b/spacy/tokens/_dict_proxies.py index 470d3430f..8643243fa 100644 --- a/spacy/tokens/_dict_proxies.py +++ b/spacy/tokens/_dict_proxies.py @@ -6,6 +6,7 @@ import srsly from .span_group import SpanGroup from ..errors import Errors + if TYPE_CHECKING: # This lets us add type hints for mypy etc. without causing circular imports from .doc import Doc # noqa: F401 @@ -19,6 +20,8 @@ if TYPE_CHECKING: class SpanGroups(UserDict): """A dict-like proxy held by the Doc, to control access to span groups.""" + _EMPTY_BYTES = srsly.msgpack_dumps([]) + def __init__( self, doc: "Doc", items: Iterable[Tuple[str, SpanGroup]] = tuple() ) -> None: @@ -43,11 +46,13 @@ class SpanGroups(UserDict): def to_bytes(self) -> bytes: # We don't need to serialize this as a dict, because the groups # know their names. 
+ if len(self) == 0: + return self._EMPTY_BYTES msg = [value.to_bytes() for value in self.values()] return srsly.msgpack_dumps(msg) def from_bytes(self, bytes_data: bytes) -> "SpanGroups": - msg = srsly.msgpack_loads(bytes_data) + msg = [] if bytes_data == self._EMPTY_BYTES else srsly.msgpack_loads(bytes_data) self.clear() doc = self._ensure_doc() for value_bytes in msg: diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py index bd2bdb811..2b72adb4d 100644 --- a/spacy/tokens/_serialize.py +++ b/spacy/tokens/_serialize.py @@ -12,6 +12,7 @@ from ..compat import copy_reg from ..attrs import SPACY, ORTH, intify_attr, IDS from ..errors import Errors from ..util import ensure_path, SimpleFrozenList +from ._dict_proxies import SpanGroups # fmt: off ALL_ATTRS = ("ORTH", "NORM", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "ENT_ID", "LEMMA", "MORPH", "POS", "SENT_START") @@ -146,7 +147,7 @@ class DocBin: doc = Doc(vocab, words=tokens[:, orth_col], spaces=spaces) # type: ignore doc = doc.from_array(self.attrs, tokens) # type: ignore doc.cats = self.cats[i] - if self.span_groups[i]: + if self.span_groups[i] != SpanGroups._EMPTY_BYTES: doc.spans.from_bytes(self.span_groups[i]) else: doc.spans.clear() From f32ee2e533c709c8f2cc00b9cce28b779f4a0304 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 21 Feb 2022 10:24:52 +0100 Subject: [PATCH 035/123] Fix NER check in CoNLL-U converter (#10302) * Fix NER check in CoNLL-U converter Leave ents unset if no NER annotation is found in the MISC column. * Revert to global rather than per-sentence NER check * Update spacy/training/converters/conllu_to_docs.py Co-authored-by: Sofie Van Landeghem Co-authored-by: Sofie Van Landeghem --- spacy/tests/test_cli.py | 8 ++++++-- spacy/training/converters/conllu_to_docs.py | 12 +++++++++--- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index fc35ff86e..ec512b839 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -34,7 +34,7 @@ from .util import make_tempdir @pytest.mark.issue(4665) -def test_issue4665(): +def test_cli_converters_conllu_empty_heads_ner(): """ conllu_to_docs should not raise an exception if the HEAD column contains an underscore @@ -59,7 +59,11 @@ def test_issue4665(): 17 . _ PUNCT . _ _ punct _ _ 18 ] _ PUNCT -RRB- _ _ punct _ _ """ - conllu_to_docs(input_data) + docs = list(conllu_to_docs(input_data)) + # heads are all 0 + assert not all([t.head.i for t in docs[0]]) + # NER is unset + assert not docs[0].has_annotation("ENT_IOB") @pytest.mark.issue(4924) diff --git a/spacy/training/converters/conllu_to_docs.py b/spacy/training/converters/conllu_to_docs.py index 7a4f44d3b..a4e70b01f 100644 --- a/spacy/training/converters/conllu_to_docs.py +++ b/spacy/training/converters/conllu_to_docs.py @@ -71,6 +71,7 @@ def read_conllx( ): """Yield docs, one for each sentence""" vocab = Vocab() # need vocab to make a minimal Doc + set_ents = has_ner(input_data, ner_tag_pattern) for sent in input_data.strip().split("\n\n"): lines = sent.strip().split("\n") if lines: @@ -83,6 +84,7 @@ def read_conllx( merge_subtokens=merge_subtokens, append_morphology=append_morphology, ner_map=ner_map, + set_ents=set_ents, ) yield doc @@ -133,6 +135,7 @@ def conllu_sentence_to_doc( merge_subtokens=False, append_morphology=False, ner_map=None, + set_ents=False, ): """Create an Example from the lines for one CoNLL-U sentence, merging subtokens and appending morphology to tags if required. 
@@ -214,8 +217,10 @@ def conllu_sentence_to_doc( doc[i]._.merged_morph = morphs[i] doc[i]._.merged_lemma = lemmas[i] doc[i]._.merged_spaceafter = spaces[i] - ents = get_entities(lines, ner_tag_pattern, ner_map) - doc.ents = biluo_tags_to_spans(doc, ents) + ents = None + if set_ents: + ents = get_entities(lines, ner_tag_pattern, ner_map) + doc.ents = biluo_tags_to_spans(doc, ents) if merge_subtokens: doc = merge_conllu_subtokens(lines, doc) @@ -247,7 +252,8 @@ def conllu_sentence_to_doc( deps=deps, heads=heads, ) - doc_x.ents = [Span(doc_x, ent.start, ent.end, label=ent.label) for ent in doc.ents] + if set_ents: + doc_x.ents = [Span(doc_x, ent.start, ent.end, label=ent.label) for ent in doc.ents] return doc_x From 30030176ee066e2de92238802d7af9d6120d689f Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 21 Feb 2022 10:26:19 +0100 Subject: [PATCH 036/123] Update Korean defaults for Tokenizer (#10322) Update Korean defaults for `Tokenizer` for tokenization following UD Korean Kaist. --- spacy/lang/ko/__init__.py | 2 ++ spacy/lang/ko/punctuation.py | 12 ++++++++++++ spacy/tests/conftest.py | 13 +++++++++++++ spacy/tests/lang/ko/test_tokenizer.py | 20 ++++++++++++++++++++ 4 files changed, 47 insertions(+) create mode 100644 spacy/lang/ko/punctuation.py diff --git a/spacy/lang/ko/__init__.py b/spacy/lang/ko/__init__.py index a03f7821a..63bc06665 100644 --- a/spacy/lang/ko/__init__.py +++ b/spacy/lang/ko/__init__.py @@ -1,5 +1,6 @@ from typing import Iterator, Any, Dict +from .punctuation import TOKENIZER_INFIXES from .stop_words import STOP_WORDS from .tag_map import TAG_MAP from .lex_attrs import LEX_ATTRS @@ -85,6 +86,7 @@ class KoreanDefaults(BaseDefaults): lex_attr_getters = LEX_ATTRS stop_words = STOP_WORDS writing_system = {"direction": "ltr", "has_case": False, "has_letters": False} + infixes = TOKENIZER_INFIXES class Korean(Language): diff --git a/spacy/lang/ko/punctuation.py b/spacy/lang/ko/punctuation.py new file mode 100644 index 000000000..7f7b40c5b --- /dev/null +++ b/spacy/lang/ko/punctuation.py @@ -0,0 +1,12 @@ +from ..char_classes import LIST_QUOTES +from ..punctuation import TOKENIZER_INFIXES as BASE_TOKENIZER_INFIXES + + +_infixes = ( + ["·", "ㆍ", "\(", "\)"] + + [r"(?<=[0-9])~(?=[0-9-])"] + + LIST_QUOTES + + BASE_TOKENIZER_INFIXES +) + +TOKENIZER_INFIXES = _infixes diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index ee90a9f38..f9266cb94 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -227,6 +227,19 @@ def ko_tokenizer(): return get_lang_class("ko")().tokenizer +@pytest.fixture(scope="session") +def ko_tokenizer_tokenizer(): + config = { + "nlp": { + "tokenizer": { + "@tokenizers": "spacy.Tokenizer.v1", + } + } + } + nlp = get_lang_class("ko").from_config(config) + return nlp.tokenizer + + @pytest.fixture(scope="session") def lb_tokenizer(): return get_lang_class("lb")().tokenizer diff --git a/spacy/tests/lang/ko/test_tokenizer.py b/spacy/tests/lang/ko/test_tokenizer.py index eac309857..e6b65dee9 100644 --- a/spacy/tests/lang/ko/test_tokenizer.py +++ b/spacy/tests/lang/ko/test_tokenizer.py @@ -47,3 +47,23 @@ def test_ko_tokenizer_pos(ko_tokenizer, text, expected_pos): def test_ko_empty_doc(ko_tokenizer): tokens = ko_tokenizer("") assert len(tokens) == 0 + + +# fmt: off +SPACY_TOKENIZER_TESTS = [ + ("있다.", "있다 ."), + ("'예'는", "' 예 ' 는"), + ("부 (富) 는", "부 ( 富 ) 는"), + ("부(富)는", "부 ( 富 ) 는"), + ("1982~1983.", "1982 ~ 1983 ."), + ("사과·배·복숭아·수박은 모두 과일이다.", "사과 · 배 · 복숭아 · 수박은 모두 과일이다 ."), + ("그렇구나~", "그렇구나~"), + ("『9시 반의 당구』,", "『 9시 반의 당구 
』 ,"), +] +# fmt: on + + +@pytest.mark.parametrize("text,expected_tokens", SPACY_TOKENIZER_TESTS) +def test_ko_spacy_tokenizer(ko_tokenizer_tokenizer, text, expected_tokens): + tokens = [token.text for token in ko_tokenizer_tokenizer(text)] + assert tokens == expected_tokens.split() From cf5b46b63e91b9a2881c3a7d52bb9d2856c809f2 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 21 Feb 2022 10:22:36 +0100 Subject: [PATCH 037/123] Fix Tok2Vec for empty batches (#10324) * Add test for tok2vec with vectors and empty docs * Add shortcut for empty batch in Tok2Vec.predict * Avoid types --- spacy/pipeline/tok2vec.py | 4 ++++ spacy/tests/pipeline/test_tok2vec.py | 23 +++++++++++++++++++++-- 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py index cb601e5dc..2e3dde3cb 100644 --- a/spacy/pipeline/tok2vec.py +++ b/spacy/pipeline/tok2vec.py @@ -118,6 +118,10 @@ class Tok2Vec(TrainablePipe): DOCS: https://spacy.io/api/tok2vec#predict """ + if not any(len(doc) for doc in docs): + # Handle cases where there are no tokens in any docs. + width = self.model.get_dim("nO") + return [self.model.ops.alloc((0, width)) for doc in docs] tokvecs = self.model.predict(docs) batch_id = Tok2VecListener.get_batch_id(docs) for listener in self.listeners: diff --git a/spacy/tests/pipeline/test_tok2vec.py b/spacy/tests/pipeline/test_tok2vec.py index eeea906bb..a5ac85e1e 100644 --- a/spacy/tests/pipeline/test_tok2vec.py +++ b/spacy/tests/pipeline/test_tok2vec.py @@ -11,7 +11,7 @@ from spacy.lang.en import English from thinc.api import Config, get_current_ops from numpy.testing import assert_array_equal -from ..util import get_batch, make_tempdir +from ..util import get_batch, make_tempdir, add_vecs_to_vocab def test_empty_doc(): @@ -140,9 +140,25 @@ TRAIN_DATA = [ ] -def test_tok2vec_listener(): +@pytest.mark.parametrize("with_vectors", (False, True)) +def test_tok2vec_listener(with_vectors): orig_config = Config().from_str(cfg_string) + orig_config["components"]["tok2vec"]["model"]["embed"][ + "include_static_vectors" + ] = with_vectors nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True) + + if with_vectors: + ops = get_current_ops() + vectors = [ + ("apple", ops.asarray([1, 2, 3])), + ("orange", ops.asarray([-1, -2, -3])), + ("and", ops.asarray([-1, -1, -1])), + ("juice", ops.asarray([5, 5, 10])), + ("pie", ops.asarray([7, 6.3, 8.9])), + ] + add_vecs_to_vocab(nlp.vocab, vectors) + assert nlp.pipe_names == ["tok2vec", "tagger"] tagger = nlp.get_pipe("tagger") tok2vec = nlp.get_pipe("tok2vec") @@ -169,6 +185,9 @@ def test_tok2vec_listener(): ops = get_current_ops() assert_array_equal(ops.to_numpy(doc.tensor), ops.to_numpy(doc_tensor)) + # test with empty doc + doc = nlp("") + # TODO: should this warn or error? nlp.select_pipes(disable="tok2vec") assert nlp.pipe_names == ["tagger"] From 78a8bec4d0a0e607acd3f9a2c6eaafe54c7ca4ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Mon, 21 Feb 2022 15:02:21 +0100 Subject: [PATCH 038/123] Make core projectivization functions cdef nogil (#10241) * Make core projectivization methods cdef nogil While profiling the parser, I noticed that relatively a lot of time is spent in projectivization. This change rewrites the functions in the core loops as cdef nogil for efficiency. In C++-land, we use vector in place of Python lists and absent heads are represented as -1 in place of None. 
* _heads_to_c: add assertion Validation should be performed by the caller, but this assertion ensures that we are not reading/writing out of bounds with incorrect input. --- spacy/pipeline/_parser_internals/nonproj.pyx | 83 +++++++++++++++----- spacy/tests/parser/test_nonproj.py | 4 +- 2 files changed, 66 insertions(+), 21 deletions(-) diff --git a/spacy/pipeline/_parser_internals/nonproj.pyx b/spacy/pipeline/_parser_internals/nonproj.pyx index 82070cd27..36163fcc3 100644 --- a/spacy/pipeline/_parser_internals/nonproj.pyx +++ b/spacy/pipeline/_parser_internals/nonproj.pyx @@ -4,6 +4,10 @@ for doing pseudo-projective parsing implementation uses the HEAD decoration scheme. """ from copy import copy +from libc.limits cimport INT_MAX +from libc.stdlib cimport abs +from libcpp cimport bool +from libcpp.vector cimport vector from ...tokens.doc cimport Doc, set_children_from_heads @@ -41,13 +45,18 @@ def contains_cycle(heads): def is_nonproj_arc(tokenid, heads): + cdef vector[int] c_heads = _heads_to_c(heads) + return _is_nonproj_arc(tokenid, c_heads) + + +cdef bool _is_nonproj_arc(int tokenid, const vector[int]& heads) nogil: # definition (e.g. Havelka 2007): an arc h -> d, h < d is non-projective # if there is a token k, h < k < d such that h is not # an ancestor of k. Same for h -> d, h > d head = heads[tokenid] if head == tokenid: # root arcs cannot be non-projective return False - elif head is None: # unattached tokens cannot be non-projective + elif head < 0: # unattached tokens cannot be non-projective return False cdef int start, end @@ -56,19 +65,29 @@ def is_nonproj_arc(tokenid, heads): else: start, end = (tokenid+1, head) for k in range(start, end): - for ancestor in ancestors(k, heads): - if ancestor is None: # for unattached tokens/subtrees - break - elif ancestor == head: # normal case: k dominated by h - break + if _has_head_as_ancestor(k, head, heads): + continue else: # head not in ancestors: d -> h is non-projective return True return False +cdef bool _has_head_as_ancestor(int tokenid, int head, const vector[int]& heads) nogil: + ancestor = tokenid + cnt = 0 + while cnt < heads.size(): + if heads[ancestor] == head or heads[ancestor] < 0: + return True + ancestor = heads[ancestor] + cnt += 1 + + return False + + def is_nonproj_tree(heads): + cdef vector[int] c_heads = _heads_to_c(heads) # a tree is non-projective if at least one arc is non-projective - return any(is_nonproj_arc(word, heads) for word in range(len(heads))) + return any(_is_nonproj_arc(word, c_heads) for word in range(len(heads))) def decompose(label): @@ -98,16 +117,31 @@ def projectivize(heads, labels): # tree, i.e. connected and cycle-free. Returns a new pair (heads, labels) # which encode a projective and decorated tree. 
proj_heads = copy(heads) - smallest_np_arc = _get_smallest_nonproj_arc(proj_heads) - if smallest_np_arc is None: # this sentence is already projective + + cdef int new_head + cdef vector[int] c_proj_heads = _heads_to_c(proj_heads) + cdef int smallest_np_arc = _get_smallest_nonproj_arc(c_proj_heads) + if smallest_np_arc == -1: # this sentence is already projective return proj_heads, copy(labels) - while smallest_np_arc is not None: - _lift(smallest_np_arc, proj_heads) - smallest_np_arc = _get_smallest_nonproj_arc(proj_heads) + while smallest_np_arc != -1: + new_head = _lift(smallest_np_arc, proj_heads) + c_proj_heads[smallest_np_arc] = new_head + smallest_np_arc = _get_smallest_nonproj_arc(c_proj_heads) deco_labels = _decorate(heads, proj_heads, labels) return proj_heads, deco_labels +cdef vector[int] _heads_to_c(heads): + cdef vector[int] c_heads; + for head in heads: + if head == None: + c_heads.push_back(-1) + else: + assert head < len(heads) + c_heads.push_back(head) + return c_heads + + cpdef deprojectivize(Doc doc): # Reattach arcs with decorated labels (following HEAD scheme). For each # decorated arc X||Y, search top-down, left-to-right, breadth-first until @@ -137,27 +171,38 @@ def _decorate(heads, proj_heads, labels): deco_labels.append(labels[tokenid]) return deco_labels +def get_smallest_nonproj_arc_slow(heads): + cdef vector[int] c_heads = _heads_to_c(heads) + return _get_smallest_nonproj_arc(c_heads) -def _get_smallest_nonproj_arc(heads): + +cdef int _get_smallest_nonproj_arc(const vector[int]& heads) nogil: # return the smallest non-proj arc or None # where size is defined as the distance between dep and head # and ties are broken left to right - smallest_size = float('inf') - smallest_np_arc = None - for tokenid, head in enumerate(heads): + cdef int smallest_size = INT_MAX + cdef int smallest_np_arc = -1 + cdef int size + cdef int tokenid + cdef int head + + for tokenid in range(heads.size()): + head = heads[tokenid] size = abs(tokenid-head) - if size < smallest_size and is_nonproj_arc(tokenid, heads): + if size < smallest_size and _is_nonproj_arc(tokenid, heads): smallest_size = size smallest_np_arc = tokenid return smallest_np_arc -def _lift(tokenid, heads): +cpdef int _lift(tokenid, heads): # reattaches a word to it's grandfather head = heads[tokenid] ghead = heads[head] + cdef int new_head = ghead if head != ghead else tokenid # attach to ghead if head isn't attached to root else attach to root - heads[tokenid] = ghead if head != ghead else tokenid + heads[tokenid] = new_head + return new_head def _find_new_head(token, headlabel): diff --git a/spacy/tests/parser/test_nonproj.py b/spacy/tests/parser/test_nonproj.py index 3957e4d77..60d000c44 100644 --- a/spacy/tests/parser/test_nonproj.py +++ b/spacy/tests/parser/test_nonproj.py @@ -93,8 +93,8 @@ def test_parser_pseudoprojectivity(en_vocab): assert nonproj.is_decorated("X") is False nonproj._lift(0, tree) assert tree == [2, 2, 2] - assert nonproj._get_smallest_nonproj_arc(nonproj_tree) == 7 - assert nonproj._get_smallest_nonproj_arc(nonproj_tree2) == 10 + assert nonproj.get_smallest_nonproj_arc_slow(nonproj_tree) == 7 + assert nonproj.get_smallest_nonproj_arc_slow(nonproj_tree2) == 10 # fmt: off proj_heads, deco_labels = nonproj.projectivize(nonproj_tree, labels) assert proj_heads == [1, 2, 2, 4, 5, 2, 7, 5, 2] From 249b97184d12664dde53a3c5b8c658ad7b8cf0ca Mon Sep 17 00:00:00 2001 From: kadarakos Date: Wed, 23 Feb 2022 16:10:05 +0100 Subject: [PATCH 039/123] Bugfixes and test for rehearse (#10347) MIME-Version: 1.0 
Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fixing argument order for rehearse * rehearse test for ner and tagger * rehearse bugfix * added test for parser * test for multilabel textcat * rehearse fix * remove debug line * Update spacy/tests/training/test_rehearse.py Co-authored-by: Sofie Van Landeghem * Update spacy/tests/training/test_rehearse.py Co-authored-by: Sofie Van Landeghem Co-authored-by: Kádár Ákos Co-authored-by: Sofie Van Landeghem --- spacy/language.py | 5 +- spacy/pipeline/tagger.pyx | 11 +- spacy/pipeline/textcat.py | 2 +- spacy/tests/training/test_rehearse.py | 168 ++++++++++++++++++++++++++ 4 files changed, 178 insertions(+), 8 deletions(-) create mode 100644 spacy/tests/training/test_rehearse.py diff --git a/spacy/language.py b/spacy/language.py index e8fd2720c..bab403f0e 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1222,8 +1222,9 @@ class Language: component_cfg = {} grads = {} - def get_grads(W, dW, key=None): + def get_grads(key, W, dW): grads[key] = (W, dW) + return W, dW get_grads.learn_rate = sgd.learn_rate # type: ignore[attr-defined, union-attr] get_grads.b1 = sgd.b1 # type: ignore[attr-defined, union-attr] @@ -1236,7 +1237,7 @@ class Language: examples, sgd=get_grads, losses=losses, **component_cfg.get(name, {}) ) for key, (W, dW) in grads.items(): - sgd(W, dW, key=key) # type: ignore[call-arg, misc] + sgd(key, W, dW) # type: ignore[call-arg, misc] return losses def begin_training( diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx index a2bec888e..e21a9096e 100644 --- a/spacy/pipeline/tagger.pyx +++ b/spacy/pipeline/tagger.pyx @@ -225,6 +225,7 @@ class Tagger(TrainablePipe): DOCS: https://spacy.io/api/tagger#rehearse """ + loss_func = SequenceCategoricalCrossentropy() if losses is None: losses = {} losses.setdefault(self.name, 0.0) @@ -236,12 +237,12 @@ class Tagger(TrainablePipe): # Handle cases where there are no tokens in any docs. 
return losses set_dropout_rate(self.model, drop) - guesses, backprop = self.model.begin_update(docs) - target = self._rehearsal_model(examples) - gradient = guesses - target - backprop(gradient) + tag_scores, bp_tag_scores = self.model.begin_update(docs) + tutor_tag_scores, _ = self._rehearsal_model.begin_update(docs) + grads, loss = loss_func(tag_scores, tutor_tag_scores) + bp_tag_scores(grads) self.finish_update(sgd) - losses[self.name] += (gradient**2).sum() + losses[self.name] += loss return losses def get_loss(self, examples, scores): diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py index 690c350fa..bc3f127fc 100644 --- a/spacy/pipeline/textcat.py +++ b/spacy/pipeline/textcat.py @@ -283,7 +283,7 @@ class TextCategorizer(TrainablePipe): return losses set_dropout_rate(self.model, drop) scores, bp_scores = self.model.begin_update(docs) - target = self._rehearsal_model(examples) + target, _ = self._rehearsal_model.begin_update(docs) gradient = scores - target bp_scores(gradient) if sgd is not None: diff --git a/spacy/tests/training/test_rehearse.py b/spacy/tests/training/test_rehearse.py new file mode 100644 index 000000000..1bb8fac86 --- /dev/null +++ b/spacy/tests/training/test_rehearse.py @@ -0,0 +1,168 @@ +import pytest +import spacy + +from typing import List +from spacy.training import Example + + +TRAIN_DATA = [ + ( + 'Who is Kofi Annan?', + { + 'entities': [(7, 18, 'PERSON')], + 'tags': ['PRON', 'AUX', 'PROPN', 'PRON', 'PUNCT'], + 'heads': [1, 1, 3, 1, 1], + 'deps': ['attr', 'ROOT', 'compound', 'nsubj', 'punct'], + 'morphs': ['', 'Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin', 'Number=Sing', 'Number=Sing', 'PunctType=Peri'], + 'cats': {'question': 1.0} + } + ), + ( + 'Who is Steve Jobs?', + { + 'entities': [(7, 17, 'PERSON')], + 'tags': ['PRON', 'AUX', 'PROPN', 'PRON', 'PUNCT'], + 'heads': [1, 1, 3, 1, 1], + 'deps': ['attr', 'ROOT', 'compound', 'nsubj', 'punct'], + 'morphs': ['', 'Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin', 'Number=Sing', 'Number=Sing', 'PunctType=Peri'], + 'cats': {'question': 1.0} + } + ), + ( + 'Bob is a nice person.', + { + 'entities': [(0, 3, 'PERSON')], + 'tags': ['PROPN', 'AUX', 'DET', 'ADJ', 'NOUN', 'PUNCT'], + 'heads': [1, 1, 4, 4, 1, 1], + 'deps': ['nsubj', 'ROOT', 'det', 'amod', 'attr', 'punct'], + 'morphs': ['Number=Sing', 'Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin', 'Definite=Ind|PronType=Art', 'Degree=Pos', 'Number=Sing', 'PunctType=Peri'], + 'cats': {'statement': 1.0} + }, + ), + ( + 'Hi Anil, how are you?', + { + 'entities': [(3, 7, 'PERSON')], + 'tags': ['INTJ', 'PROPN', 'PUNCT', 'ADV', 'AUX', 'PRON', 'PUNCT'], + 'deps': ['intj', 'npadvmod', 'punct', 'advmod', 'ROOT', 'nsubj', 'punct'], + 'heads': [4, 0, 4, 4, 4, 4, 4], + 'morphs': ['', 'Number=Sing', 'PunctType=Comm', '', 'Mood=Ind|Tense=Pres|VerbForm=Fin', 'Case=Nom|Person=2|PronType=Prs', 'PunctType=Peri'], + 'cats': {'greeting': 1.0, 'question': 1.0} + } + ), + ( + 'I like London and Berlin.', + { + 'entities': [(7, 13, 'LOC'), (18, 24, 'LOC')], + 'tags': ['PROPN', 'VERB', 'PROPN', 'CCONJ', 'PROPN', 'PUNCT'], + 'deps': ['nsubj', 'ROOT', 'dobj', 'cc', 'conj', 'punct'], + 'heads': [1, 1, 1, 2, 2, 1], + 'morphs': ['Case=Nom|Number=Sing|Person=1|PronType=Prs', 'Tense=Pres|VerbForm=Fin', 'Number=Sing', 'ConjType=Cmp', 'Number=Sing', 'PunctType=Peri'], + 'cats': {'statement': 1.0} + } + ) +] + +REHEARSE_DATA = [ + ( + 'Hi Anil', + { + 'entities': [(3, 7, 'PERSON')], + 'tags': ['INTJ', 'PROPN'], + 'deps': ['ROOT', 'npadvmod'], + 'heads': [0, 0], + 
'morphs': ['', 'Number=Sing'], + 'cats': {'greeting': 1.0} + } + ), + ( + 'Hi Ravish, how you doing?', + { + 'entities': [(3, 9, 'PERSON')], + 'tags': ['INTJ', 'PROPN', 'PUNCT', 'ADV', 'AUX', 'PRON', 'PUNCT'], + 'deps': ['intj', 'ROOT', 'punct', 'advmod', 'nsubj', 'advcl', 'punct'], + 'heads': [1, 1, 1, 5, 5, 1, 1], + 'morphs': ['', 'VerbForm=Inf', 'PunctType=Comm', '', 'Case=Nom|Person=2|PronType=Prs', 'Aspect=Prog|Tense=Pres|VerbForm=Part', 'PunctType=Peri'], + 'cats': {'greeting': 1.0, 'question': 1.0} + } + ), + # UTENSIL new label + ( + 'Natasha bought new forks.', + { + 'entities': [(0, 7, 'PERSON'), (19, 24, 'UTENSIL')], + 'tags': ['PROPN', 'VERB', 'ADJ', 'NOUN', 'PUNCT'], + 'deps': ['nsubj', 'ROOT', 'amod', 'dobj', 'punct'], + 'heads': [1, 1, 3, 1, 1], + 'morphs': ['Number=Sing', 'Tense=Past|VerbForm=Fin', 'Degree=Pos', 'Number=Plur', 'PunctType=Peri'], + 'cats': {'statement': 1.0} + } + ) +] + + +def _add_ner_label(ner, data): + for _, annotations in data: + for ent in annotations['entities']: + ner.add_label(ent[2]) + + +def _add_tagger_label(tagger, data): + for _, annotations in data: + for tag in annotations['tags']: + tagger.add_label(tag) + + +def _add_parser_label(parser, data): + for _, annotations in data: + for dep in annotations['deps']: + parser.add_label(dep) + + +def _add_textcat_label(textcat, data): + for _, annotations in data: + for cat in annotations['cats']: + textcat.add_label(cat) + + +def _optimize( + nlp, + component: str, + data: List, + rehearse: bool +): + """Run either train or rehearse.""" + pipe = nlp.get_pipe(component) + if component == 'ner': + _add_ner_label(pipe, data) + elif component == 'tagger': + _add_tagger_label(pipe, data) + elif component == 'parser': + _add_tagger_label(pipe, data) + elif component == 'textcat_multilabel': + _add_textcat_label(pipe, data) + else: + raise NotImplementedError + + if rehearse: + optimizer = nlp.resume_training() + else: + optimizer = nlp.initialize() + + for _ in range(5): + for text, annotation in data: + doc = nlp.make_doc(text) + example = Example.from_dict(doc, annotation) + if rehearse: + nlp.rehearse([example], sgd=optimizer) + else: + nlp.update([example], sgd=optimizer) + return nlp + + +@pytest.mark.parametrize("component", ['ner', 'tagger', 'parser', 'textcat_multilabel']) +def test_rehearse(component): + nlp = spacy.blank("en") + nlp.add_pipe(component) + nlp = _optimize(nlp, component, TRAIN_DATA, False) + _optimize(nlp, component, REHEARSE_DATA, True) From b16da378bb584c4b2a12a4b944cb3141a4ec7789 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 23 Feb 2022 21:08:56 +0100 Subject: [PATCH 040/123] Re-remove universe tests from test suite (#10357) --- .gitignore | 1 - setup.py | 1 - spacy/tests/universe/test_universe_json.py | 17 ----------------- 3 files changed, 19 deletions(-) delete mode 100644 spacy/tests/universe/test_universe_json.py diff --git a/.gitignore b/.gitignore index 60036a475..ac72f2bbf 100644 --- a/.gitignore +++ b/.gitignore @@ -9,7 +9,6 @@ keys/ spacy/tests/package/setup.cfg spacy/tests/package/pyproject.toml spacy/tests/package/requirements.txt -spacy/tests/universe/universe.json # Website website/.cache/ diff --git a/setup.py b/setup.py index 03a1e01dd..fcc124a43 100755 --- a/setup.py +++ b/setup.py @@ -81,7 +81,6 @@ COPY_FILES = { ROOT / "setup.cfg": PACKAGE_ROOT / "tests" / "package", ROOT / "pyproject.toml": PACKAGE_ROOT / "tests" / "package", ROOT / "requirements.txt": PACKAGE_ROOT / "tests" / "package", - ROOT / "website" / "meta" / "universe.json": PACKAGE_ROOT / 
"tests" / "universe", } diff --git a/spacy/tests/universe/test_universe_json.py b/spacy/tests/universe/test_universe_json.py deleted file mode 100644 index 295889186..000000000 --- a/spacy/tests/universe/test_universe_json.py +++ /dev/null @@ -1,17 +0,0 @@ -import json -import re -from pathlib import Path - - -def test_universe_json(): - - root_dir = Path(__file__).parent - universe_file = root_dir / "universe.json" - - with universe_file.open() as f: - universe_data = json.load(f) - for entry in universe_data["resources"]: - if "github" in entry: - assert not re.match( - r"^(http:)|^(https:)", entry["github"] - ), "Github field should be user/repo, not a url" From 5f568f7e41f5bba85ac7f135d3a2dfee3cb2e2b1 Mon Sep 17 00:00:00 2001 From: Sam Edwardes Date: Wed, 23 Feb 2022 21:18:10 -0800 Subject: [PATCH 041/123] Updated spaCy universe for spacytextblob (#10335) * Updated spacytextblob in universe.json * Fixed json * Update website/meta/universe.json Co-authored-by: Sofie Van Landeghem * Added spacy_version tag to spacytextblob Co-authored-by: Sofie Van Landeghem --- website/meta/universe.json | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/website/meta/universe.json b/website/meta/universe.json index 122281583..6374600f2 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -227,11 +227,11 @@ }, { "id": "spacy-textblob", - "title": "spaCyTextBlob", - "slogan": "Easy sentiment analysis for spaCy using TextBlob. Now supports spaCy 3.0!", - "thumb": "https://github.com/SamEdwardes/spaCyTextBlob/raw/main/website/static/img/logo-thumb-square-250x250.png", - "description": "spaCyTextBlob is a pipeline component that enables sentiment analysis using the [TextBlob](https://github.com/sloria/TextBlob) library. It will add the additional extensions `._.polarity`, `._.subjectivity`, and `._.assessments` to `Doc`, `Span`, and `Token` objects. For spaCy 2 please use `pip install pip install spacytextblob==0.1.7`", - "github": "SamEdwardes/spaCyTextBlob", + "title": "spacytextblob", + "slogan": "A TextBlob sentiment analysis pipeline component for spaCy.", + "thumb": "https://github.com/SamEdwardes/spacytextblob/raw/main/docs/static/img/logo-thumb-square-250x250.png", + "description": "spacytextblob is a pipeline component that enables sentiment analysis using the [TextBlob](https://github.com/sloria/TextBlob) library. It will add the additional extension `._.blob` to `Doc`, `Span`, and `Token` objects.", + "github": "SamEdwardes/spacytextblob", "pip": "spacytextblob", "code_example": [ "import spacy", @@ -241,9 +241,10 @@ "nlp.add_pipe('spacytextblob')", "text = 'I had a really horrible day. It was the worst day ever! 
But every now and then I have a really good day that makes me happy.'", "doc = nlp(text)", - "doc._.polarity # Polarity: -0.125", - "doc._.subjectivity # Sujectivity: 0.9", - "doc._.assessments # Assessments: [(['really', 'horrible'], -1.0, 1.0, None), (['worst', '!'], -1.0, 1.0, None), (['really', 'good'], 0.7, 0.6000000000000001, None), (['happy'], 0.8, 1.0, None)]" + "doc._.blob.polarity # Polarity: -0.125", + "doc._.blob.subjectivity # Subjectivity: 0.9", + "doc._.blob.sentiment_assessments.assessments # Assessments: [(['really', 'horrible'], -1.0, 1.0, None), (['worst', '!'], -1.0, 1.0, None), (['really', 'good'], 0.7, 0.6000000000000001, None), (['happy'], 0.8, 1.0, None)]", + "doc._.blob.ngrams() # [WordList(['I', 'had', 'a']), WordList(['had', 'a', 'really']), WordList(['a', 'really', 'horrible']), WordList(['really', 'horrible', 'day']), WordList(['horrible', 'day', 'It']), WordList(['day', 'It', 'was']), WordList(['It', 'was', 'the']), WordList(['was', 'the', 'worst']), WordList(['the', 'worst', 'day']), WordList(['worst', 'day', 'ever']), WordList(['day', 'ever', 'But']), WordList(['ever', 'But', 'every']), WordList(['But', 'every', 'now']), WordList(['every', 'now', 'and']), WordList(['now', 'and', 'then']), WordList(['and', 'then', 'I']), WordList(['then', 'I', 'have']), WordList(['I', 'have', 'a']), WordList(['have', 'a', 'really']), WordList(['a', 'really', 'good']), WordList(['really', 'good', 'day']), WordList(['good', 'day', 'that']), WordList(['day', 'that', 'makes']), WordList(['that', 'makes', 'me']), WordList(['makes', 'me', 'happy'])]" ], "code_language": "python", "url": "https://spacytextblob.netlify.app/", @@ -254,7 +255,8 @@ "website": "https://samedwardes.com" }, "category": ["pipeline"], - "tags": ["sentiment", "textblob"] + "tags": ["sentiment", "textblob"], + "spacy_version": 3 }, { "id": "spacy-ray", From d637b34e2f58199eb4cbb58634981334a5a17185 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 25 Feb 2022 10:00:21 +0100 Subject: [PATCH 042/123] Auto-format code with black (#10377) Co-authored-by: explosion-bot --- spacy/tests/training/test_rehearse.py | 215 ++++++++++++-------- spacy/training/converters/conllu_to_docs.py | 4 +- 2 files changed, 132 insertions(+), 87 deletions(-) diff --git a/spacy/tests/training/test_rehearse.py b/spacy/tests/training/test_rehearse.py index 1bb8fac86..84c507702 100644 --- a/spacy/tests/training/test_rehearse.py +++ b/spacy/tests/training/test_rehearse.py @@ -7,139 +7,182 @@ from spacy.training import Example TRAIN_DATA = [ ( - 'Who is Kofi Annan?', + "Who is Kofi Annan?", { - 'entities': [(7, 18, 'PERSON')], - 'tags': ['PRON', 'AUX', 'PROPN', 'PRON', 'PUNCT'], - 'heads': [1, 1, 3, 1, 1], - 'deps': ['attr', 'ROOT', 'compound', 'nsubj', 'punct'], - 'morphs': ['', 'Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin', 'Number=Sing', 'Number=Sing', 'PunctType=Peri'], - 'cats': {'question': 1.0} - } - ), - ( - 'Who is Steve Jobs?', - { - 'entities': [(7, 17, 'PERSON')], - 'tags': ['PRON', 'AUX', 'PROPN', 'PRON', 'PUNCT'], - 'heads': [1, 1, 3, 1, 1], - 'deps': ['attr', 'ROOT', 'compound', 'nsubj', 'punct'], - 'morphs': ['', 'Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin', 'Number=Sing', 'Number=Sing', 'PunctType=Peri'], - 'cats': {'question': 1.0} - } - ), - ( - 'Bob is a nice person.', - { - 'entities': [(0, 3, 'PERSON')], - 'tags': ['PROPN', 'AUX', 'DET', 'ADJ', 'NOUN', 'PUNCT'], - 'heads': [1, 1, 4, 4, 1, 1], - 'deps': ['nsubj', 'ROOT', 'det', 
'amod', 'attr', 'punct'], - 'morphs': ['Number=Sing', 'Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin', 'Definite=Ind|PronType=Art', 'Degree=Pos', 'Number=Sing', 'PunctType=Peri'], - 'cats': {'statement': 1.0} + "entities": [(7, 18, "PERSON")], + "tags": ["PRON", "AUX", "PROPN", "PRON", "PUNCT"], + "heads": [1, 1, 3, 1, 1], + "deps": ["attr", "ROOT", "compound", "nsubj", "punct"], + "morphs": [ + "", + "Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin", + "Number=Sing", + "Number=Sing", + "PunctType=Peri", + ], + "cats": {"question": 1.0}, }, ), ( - 'Hi Anil, how are you?', + "Who is Steve Jobs?", { - 'entities': [(3, 7, 'PERSON')], - 'tags': ['INTJ', 'PROPN', 'PUNCT', 'ADV', 'AUX', 'PRON', 'PUNCT'], - 'deps': ['intj', 'npadvmod', 'punct', 'advmod', 'ROOT', 'nsubj', 'punct'], - 'heads': [4, 0, 4, 4, 4, 4, 4], - 'morphs': ['', 'Number=Sing', 'PunctType=Comm', '', 'Mood=Ind|Tense=Pres|VerbForm=Fin', 'Case=Nom|Person=2|PronType=Prs', 'PunctType=Peri'], - 'cats': {'greeting': 1.0, 'question': 1.0} - } + "entities": [(7, 17, "PERSON")], + "tags": ["PRON", "AUX", "PROPN", "PRON", "PUNCT"], + "heads": [1, 1, 3, 1, 1], + "deps": ["attr", "ROOT", "compound", "nsubj", "punct"], + "morphs": [ + "", + "Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin", + "Number=Sing", + "Number=Sing", + "PunctType=Peri", + ], + "cats": {"question": 1.0}, + }, ), ( - 'I like London and Berlin.', + "Bob is a nice person.", { - 'entities': [(7, 13, 'LOC'), (18, 24, 'LOC')], - 'tags': ['PROPN', 'VERB', 'PROPN', 'CCONJ', 'PROPN', 'PUNCT'], - 'deps': ['nsubj', 'ROOT', 'dobj', 'cc', 'conj', 'punct'], - 'heads': [1, 1, 1, 2, 2, 1], - 'morphs': ['Case=Nom|Number=Sing|Person=1|PronType=Prs', 'Tense=Pres|VerbForm=Fin', 'Number=Sing', 'ConjType=Cmp', 'Number=Sing', 'PunctType=Peri'], - 'cats': {'statement': 1.0} - } - ) + "entities": [(0, 3, "PERSON")], + "tags": ["PROPN", "AUX", "DET", "ADJ", "NOUN", "PUNCT"], + "heads": [1, 1, 4, 4, 1, 1], + "deps": ["nsubj", "ROOT", "det", "amod", "attr", "punct"], + "morphs": [ + "Number=Sing", + "Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin", + "Definite=Ind|PronType=Art", + "Degree=Pos", + "Number=Sing", + "PunctType=Peri", + ], + "cats": {"statement": 1.0}, + }, + ), + ( + "Hi Anil, how are you?", + { + "entities": [(3, 7, "PERSON")], + "tags": ["INTJ", "PROPN", "PUNCT", "ADV", "AUX", "PRON", "PUNCT"], + "deps": ["intj", "npadvmod", "punct", "advmod", "ROOT", "nsubj", "punct"], + "heads": [4, 0, 4, 4, 4, 4, 4], + "morphs": [ + "", + "Number=Sing", + "PunctType=Comm", + "", + "Mood=Ind|Tense=Pres|VerbForm=Fin", + "Case=Nom|Person=2|PronType=Prs", + "PunctType=Peri", + ], + "cats": {"greeting": 1.0, "question": 1.0}, + }, + ), + ( + "I like London and Berlin.", + { + "entities": [(7, 13, "LOC"), (18, 24, "LOC")], + "tags": ["PROPN", "VERB", "PROPN", "CCONJ", "PROPN", "PUNCT"], + "deps": ["nsubj", "ROOT", "dobj", "cc", "conj", "punct"], + "heads": [1, 1, 1, 2, 2, 1], + "morphs": [ + "Case=Nom|Number=Sing|Person=1|PronType=Prs", + "Tense=Pres|VerbForm=Fin", + "Number=Sing", + "ConjType=Cmp", + "Number=Sing", + "PunctType=Peri", + ], + "cats": {"statement": 1.0}, + }, + ), ] REHEARSE_DATA = [ ( - 'Hi Anil', + "Hi Anil", { - 'entities': [(3, 7, 'PERSON')], - 'tags': ['INTJ', 'PROPN'], - 'deps': ['ROOT', 'npadvmod'], - 'heads': [0, 0], - 'morphs': ['', 'Number=Sing'], - 'cats': {'greeting': 1.0} - } + "entities": [(3, 7, "PERSON")], + "tags": ["INTJ", "PROPN"], + "deps": ["ROOT", "npadvmod"], + "heads": [0, 0], + "morphs": ["", "Number=Sing"], + "cats": {"greeting": 
1.0}, + }, ), ( - 'Hi Ravish, how you doing?', + "Hi Ravish, how you doing?", { - 'entities': [(3, 9, 'PERSON')], - 'tags': ['INTJ', 'PROPN', 'PUNCT', 'ADV', 'AUX', 'PRON', 'PUNCT'], - 'deps': ['intj', 'ROOT', 'punct', 'advmod', 'nsubj', 'advcl', 'punct'], - 'heads': [1, 1, 1, 5, 5, 1, 1], - 'morphs': ['', 'VerbForm=Inf', 'PunctType=Comm', '', 'Case=Nom|Person=2|PronType=Prs', 'Aspect=Prog|Tense=Pres|VerbForm=Part', 'PunctType=Peri'], - 'cats': {'greeting': 1.0, 'question': 1.0} - } + "entities": [(3, 9, "PERSON")], + "tags": ["INTJ", "PROPN", "PUNCT", "ADV", "AUX", "PRON", "PUNCT"], + "deps": ["intj", "ROOT", "punct", "advmod", "nsubj", "advcl", "punct"], + "heads": [1, 1, 1, 5, 5, 1, 1], + "morphs": [ + "", + "VerbForm=Inf", + "PunctType=Comm", + "", + "Case=Nom|Person=2|PronType=Prs", + "Aspect=Prog|Tense=Pres|VerbForm=Part", + "PunctType=Peri", + ], + "cats": {"greeting": 1.0, "question": 1.0}, + }, ), # UTENSIL new label ( - 'Natasha bought new forks.', + "Natasha bought new forks.", { - 'entities': [(0, 7, 'PERSON'), (19, 24, 'UTENSIL')], - 'tags': ['PROPN', 'VERB', 'ADJ', 'NOUN', 'PUNCT'], - 'deps': ['nsubj', 'ROOT', 'amod', 'dobj', 'punct'], - 'heads': [1, 1, 3, 1, 1], - 'morphs': ['Number=Sing', 'Tense=Past|VerbForm=Fin', 'Degree=Pos', 'Number=Plur', 'PunctType=Peri'], - 'cats': {'statement': 1.0} - } - ) + "entities": [(0, 7, "PERSON"), (19, 24, "UTENSIL")], + "tags": ["PROPN", "VERB", "ADJ", "NOUN", "PUNCT"], + "deps": ["nsubj", "ROOT", "amod", "dobj", "punct"], + "heads": [1, 1, 3, 1, 1], + "morphs": [ + "Number=Sing", + "Tense=Past|VerbForm=Fin", + "Degree=Pos", + "Number=Plur", + "PunctType=Peri", + ], + "cats": {"statement": 1.0}, + }, + ), ] def _add_ner_label(ner, data): for _, annotations in data: - for ent in annotations['entities']: + for ent in annotations["entities"]: ner.add_label(ent[2]) def _add_tagger_label(tagger, data): for _, annotations in data: - for tag in annotations['tags']: + for tag in annotations["tags"]: tagger.add_label(tag) def _add_parser_label(parser, data): for _, annotations in data: - for dep in annotations['deps']: + for dep in annotations["deps"]: parser.add_label(dep) def _add_textcat_label(textcat, data): for _, annotations in data: - for cat in annotations['cats']: + for cat in annotations["cats"]: textcat.add_label(cat) -def _optimize( - nlp, - component: str, - data: List, - rehearse: bool -): +def _optimize(nlp, component: str, data: List, rehearse: bool): """Run either train or rehearse.""" pipe = nlp.get_pipe(component) - if component == 'ner': + if component == "ner": _add_ner_label(pipe, data) - elif component == 'tagger': + elif component == "tagger": _add_tagger_label(pipe, data) - elif component == 'parser': + elif component == "parser": _add_tagger_label(pipe, data) - elif component == 'textcat_multilabel': + elif component == "textcat_multilabel": _add_textcat_label(pipe, data) else: raise NotImplementedError @@ -160,7 +203,7 @@ def _optimize( return nlp -@pytest.mark.parametrize("component", ['ner', 'tagger', 'parser', 'textcat_multilabel']) +@pytest.mark.parametrize("component", ["ner", "tagger", "parser", "textcat_multilabel"]) def test_rehearse(component): nlp = spacy.blank("en") nlp.add_pipe(component) diff --git a/spacy/training/converters/conllu_to_docs.py b/spacy/training/converters/conllu_to_docs.py index a4e70b01f..7052504cc 100644 --- a/spacy/training/converters/conllu_to_docs.py +++ b/spacy/training/converters/conllu_to_docs.py @@ -253,7 +253,9 @@ def conllu_sentence_to_doc( heads=heads, ) if set_ents: - doc_x.ents = 
[Span(doc_x, ent.start, ent.end, label=ent.label) for ent in doc.ents] + doc_x.ents = [ + Span(doc_x, ent.start, ent.end, label=ent.label) for ent in doc.ents + ] return doc_x From 3f68bbcfec44ef55d101e6db742d353b72652129 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Fri, 25 Feb 2022 16:29:12 +0100 Subject: [PATCH 043/123] Clean up loggers docs (#10351) * update docs to point to spacy-loggers docs * remove unused error code --- spacy/errors.py | 3 --- website/docs/api/legacy.md | 21 ++------------------- website/docs/api/top-level.md | 2 +- 3 files changed, 3 insertions(+), 23 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index b45c4f9db..5399e489b 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -566,9 +566,6 @@ class Errors(metaclass=ErrorsWithCodes): E879 = ("Unexpected type for 'spans' data. Provide a dictionary mapping keys to " "a list of spans, with each span represented by a tuple (start_char, end_char). " "The tuple can be optionally extended with a label and a KB ID.") - E880 = ("The 'wandb' library could not be found - did you install it? " - "Alternatively, specify the 'ConsoleLogger' in the 'training.logger' " - "config section, instead of the 'WandbLogger'.") E884 = ("The pipeline could not be initialized because the vectors " "could not be found at '{vectors}'. If your pipeline was already " "initialized/trained before, call 'resume_training' instead of 'initialize', " diff --git a/website/docs/api/legacy.md b/website/docs/api/legacy.md index 916a5bf7f..e24c37d77 100644 --- a/website/docs/api/legacy.md +++ b/website/docs/api/legacy.md @@ -248,23 +248,6 @@ the others, but may not be as accurate, especially if texts are short. ## Loggers {#loggers} -These functions are available from `@spacy.registry.loggers`. +Logging utilities for spaCy are implemented in the [`spacy-loggers`](https://github.com/explosion/spacy-loggers) repo, and the functions are typically available from `@spacy.registry.loggers`. -### spacy.WandbLogger.v1 {#WandbLogger_v1} - -The first version of the [`WandbLogger`](/api/top-level#WandbLogger) did not yet -support the `log_dataset_dir` and `model_log_interval` arguments. - -> #### Example config -> -> ```ini -> [training.logger] -> @loggers = "spacy.WandbLogger.v1" -> project_name = "monitor_spacy_training" -> remove_config_values = ["paths.train", "paths.dev", "corpora.train.path", "corpora.dev.path"] -> ``` -> -> | Name | Description | -> | ---------------------- | ------------------------------------------------------------------------------------------------------------------------------------- | -> | `project_name` | The name of the project in the Weights & Biases interface. The project will be created automatically if it doesn't exist yet. ~~str~~ | -> | `remove_config_values` | A list of values to include from the config before it is uploaded to W&B (default: empty). ~~List[str]~~ | +More documentation can be found in that repo's [readme](https://github.com/explosion/spacy-loggers/blob/main/README.md) file. diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index be19f9c3a..1a3e9da46 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -423,7 +423,7 @@ and the accuracy scores on the development set. The built-in, default logger is the ConsoleLogger, which prints results to the console in tabular format. 
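For example, a training config can select this default logger explicitly; a minimal sketch (the `progress_bar` setting is optional):

```ini
[training.logger]
@loggers = "spacy.ConsoleLogger.v1"
progress_bar = false
```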
The [spacy-loggers](https://github.com/explosion/spacy-loggers) package, included as -a dependency of spaCy, enables other loggers: currently it provides one that +a dependency of spaCy, enables other loggers, such as one that sends results to a [Weights & Biases](https://www.wandb.com/) dashboard. Instead of using one of the built-in loggers, you can From 8e93fa850748c884c71505b4f26c46d0c98d3ba1 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 1 Mar 2022 09:21:25 +0100 Subject: [PATCH 044/123] Fix Vectors.n_keys for floret vectors (#10394) Fix `Vectors.n_keys` for floret vectors to match docstring description and avoid W007 warnings in similarity methods. --- spacy/tests/vocab_vectors/test_vectors.py | 4 ++++ spacy/vectors.pyx | 2 ++ website/docs/api/vectors.md | 10 +++++----- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/spacy/tests/vocab_vectors/test_vectors.py b/spacy/tests/vocab_vectors/test_vectors.py index 0650a7487..ffd7489b2 100644 --- a/spacy/tests/vocab_vectors/test_vectors.py +++ b/spacy/tests/vocab_vectors/test_vectors.py @@ -535,6 +535,10 @@ def test_floret_vectors(floret_vectors_vec_str, floret_vectors_hashvec_str): # every word has a vector assert nlp.vocab[word * 5].has_vector + # n_keys is -1 for floret + assert nlp_plain.vocab.vectors.n_keys > 0 + assert nlp.vocab.vectors.n_keys == -1 + # check that single and batched vector lookups are identical words = [s for s in nlp_plain.vocab.vectors] single_vecs = OPS.to_numpy(OPS.asarray([nlp.vocab[word].vector for word in words])) diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx index bc4863703..2b1ea764b 100644 --- a/spacy/vectors.pyx +++ b/spacy/vectors.pyx @@ -170,6 +170,8 @@ cdef class Vectors: DOCS: https://spacy.io/api/vectors#n_keys """ + if self.mode == Mode.floret: + return -1 return len(self.key2row) def __reduce__(self): diff --git a/website/docs/api/vectors.md b/website/docs/api/vectors.md index b3bee822c..a651c23b0 100644 --- a/website/docs/api/vectors.md +++ b/website/docs/api/vectors.md @@ -327,9 +327,9 @@ will be counted individually. In `floret` mode, the keys table is not used. > assert vectors.n_keys == 0 > ``` -| Name | Description | -| ----------- | -------------------------------------------- | -| **RETURNS** | The number of all keys in the table. ~~int~~ | +| Name | Description | +| ----------- | ----------------------------------------------------------------------------- | +| **RETURNS** | The number of all keys in the table. Returns `-1` for floret vectors. ~~int~~ | ## Vectors.most_similar {#most_similar tag="method"} @@ -348,7 +348,7 @@ supported for `floret` mode. > ``` | Name | Description | -| -------------- | --------------------------------------------------------------------------- | +| -------------- | --------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------- | | `queries` | An array with one or more vectors. ~~numpy.ndarray~~ | | _keyword-only_ | | | `batch_size` | The batch size to use. Default to `1024`. ~~int~~ | @@ -385,7 +385,7 @@ Change the embedding matrix to use different Thinc ops. > ``` | Name | Description | -|-------|----------------------------------------------------------| +| ----- | -------------------------------------------------------- | | `ops` | The Thinc ops to switch the embedding matrix to. 
~~Ops~~ | ## Vectors.to_disk {#to_disk tag="method"} From 91acc3ea75d219ad07ed2b106e7b8bdcb01516dd Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Fri, 4 Mar 2022 17:17:36 +0900 Subject: [PATCH 045/123] Fix entity linker batching (#9669) * Partial fix of entity linker batching * Add import * Better name * Add `use_gold_ents` option, docs * Change to v2, create stub v1, update docs etc. * Fix error type Honestly no idea what the right type to use here is. ConfigValidationError seems wrong. Maybe a NotImplementedError? * Make mypy happy * Add hacky fix for init issue * Add legacy pipeline entity linker * Fix references to class name * Add __init__.py for legacy * Attempted fix for loss issue * Remove placeholder V1 * formatting * slightly more interesting train data * Handle batches with no usable examples This adds a test for batches that have docs but not entities, and a check in the component that detects such cases and skips the update step as thought the batch were empty. * Remove todo about data verification Check for empty data was moved further up so this should be OK now - the case in question shouldn't be possible. * Fix gradient calculation The model doesn't know which entities are not in the kb, so it generates embeddings for the context of all of them. However, the loss does know which entities aren't in the kb, and it ignores them, as there's no sensible gradient. This has the issue that the gradient will not be calculated for some of the input embeddings, which causes a dimension mismatch in backprop. That should have caused a clear error, but with numpyops it was causing nans to happen, which is another problem that should be addressed separately. This commit changes the loss to give a zero gradient for entities not in the kb. * add failing test for v1 EL legacy architecture * Add nasty but simple working check for legacy arch * Clarify why init hack works the way it does * Clarify use_gold_ents use case * Fix use gold ents related handling * Add tests for no gold ents and fix other tests * Use aligned ents function (not working) This doesn't actually work because the "aligned" ents are gold-only. But if I have a different function that returns the intersection, *then* this will work as desired. * Use proper matching ent check This changes the process when gold ents are not used so that the intersection of ents in the pred and gold is used. 
* Move get_matching_ents to Example * Use model attribute to check for legacy arch * Rename flag * bump spacy-legacy to lower 3.0.9 Co-authored-by: svlandeg --- requirements.txt | 2 +- setup.cfg | 2 +- spacy/cli/templates/quickstart_training.jinja | 4 +- spacy/ml/extract_spans.py | 2 +- spacy/ml/models/entity_linker.py | 60 ++- spacy/pipeline/entity_linker.py | 135 ++++-- spacy/pipeline/legacy/__init__.py | 3 + spacy/pipeline/legacy/entity_linker.py | 427 ++++++++++++++++++ spacy/tests/pipeline/test_entity_linker.py | 151 ++++++- spacy/training/example.pyx | 23 + website/docs/api/architectures.md | 4 +- website/docs/api/entitylinker.md | 1 + 12 files changed, 765 insertions(+), 49 deletions(-) create mode 100644 spacy/pipeline/legacy/__init__.py create mode 100644 spacy/pipeline/legacy/entity_linker.py diff --git a/requirements.txt b/requirements.txt index ca4099be5..b8970f686 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ # Our libraries -spacy-legacy>=3.0.8,<3.1.0 +spacy-legacy>=3.0.9,<3.1.0 spacy-loggers>=1.0.0,<2.0.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 diff --git a/setup.cfg b/setup.cfg index 586a044ff..ed3bf63ce 100644 --- a/setup.cfg +++ b/setup.cfg @@ -41,7 +41,7 @@ setup_requires = thinc>=8.0.12,<8.1.0 install_requires = # Our libraries - spacy-legacy>=3.0.8,<3.1.0 + spacy-legacy>=3.0.9,<3.1.0 spacy-loggers>=1.0.0,<2.0.0 murmurhash>=0.28.0,<1.1.0 cymem>=2.0.2,<2.1.0 diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja index fb79a4f60..da533b767 100644 --- a/spacy/cli/templates/quickstart_training.jinja +++ b/spacy/cli/templates/quickstart_training.jinja @@ -131,7 +131,7 @@ incl_context = true incl_prior = true [components.entity_linker.model] -@architectures = "spacy.EntityLinker.v1" +@architectures = "spacy.EntityLinker.v2" nO = null [components.entity_linker.model.tok2vec] @@ -303,7 +303,7 @@ incl_context = true incl_prior = true [components.entity_linker.model] -@architectures = "spacy.EntityLinker.v1" +@architectures = "spacy.EntityLinker.v2" nO = null [components.entity_linker.model.tok2vec] diff --git a/spacy/ml/extract_spans.py b/spacy/ml/extract_spans.py index edc86ff9c..d5e9bc07c 100644 --- a/spacy/ml/extract_spans.py +++ b/spacy/ml/extract_spans.py @@ -63,4 +63,4 @@ def _get_span_indices(ops, spans: Ragged, lengths: Ints1d) -> Ints1d: def _ensure_cpu(spans: Ragged, lengths: Ints1d) -> Tuple[Ragged, Ints1d]: - return (Ragged(to_numpy(spans.dataXd), to_numpy(spans.lengths)), to_numpy(lengths)) + return Ragged(to_numpy(spans.dataXd), to_numpy(spans.lengths)), to_numpy(lengths) diff --git a/spacy/ml/models/entity_linker.py b/spacy/ml/models/entity_linker.py index 831fee90f..0149bea89 100644 --- a/spacy/ml/models/entity_linker.py +++ b/spacy/ml/models/entity_linker.py @@ -1,34 +1,82 @@ from pathlib import Path -from typing import Optional, Callable, Iterable, List +from typing import Optional, Callable, Iterable, List, Tuple from thinc.types import Floats2d from thinc.api import chain, clone, list2ragged, reduce_mean, residual -from thinc.api import Model, Maxout, Linear +from thinc.api import Model, Maxout, Linear, noop, tuplify, Ragged from ...util import registry from ...kb import KnowledgeBase, Candidate, get_candidates from ...vocab import Vocab from ...tokens import Span, Doc +from ..extract_spans import extract_spans +from ...errors import Errors -@registry.architectures("spacy.EntityLinker.v1") +@registry.architectures("spacy.EntityLinker.v2") def build_nel_encoder( tok2vec: Model, nO: 
Optional[int] = None ) -> Model[List[Doc], Floats2d]: - with Model.define_operators({">>": chain, "**": clone}): + with Model.define_operators({">>": chain, "&": tuplify}): token_width = tok2vec.maybe_get_dim("nO") output_layer = Linear(nO=nO, nI=token_width) model = ( - tok2vec - >> list2ragged() + ((tok2vec >> list2ragged()) & build_span_maker()) + >> extract_spans() >> reduce_mean() >> residual(Maxout(nO=token_width, nI=token_width, nP=2, dropout=0.0)) # type: ignore[arg-type] >> output_layer ) model.set_ref("output_layer", output_layer) model.set_ref("tok2vec", tok2vec) + # flag to show this isn't legacy + model.attrs["include_span_maker"] = True return model +def build_span_maker(n_sents: int = 0) -> Model: + model: Model = Model("span_maker", forward=span_maker_forward) + model.attrs["n_sents"] = n_sents + return model + + +def span_maker_forward(model, docs: List[Doc], is_train) -> Tuple[Ragged, Callable]: + ops = model.ops + n_sents = model.attrs["n_sents"] + candidates = [] + for doc in docs: + cands = [] + try: + sentences = [s for s in doc.sents] + except ValueError: + # no sentence info, normal in initialization + for tok in doc: + tok.is_sent_start = tok.i == 0 + sentences = [doc[:]] + for ent in doc.ents: + try: + # find the sentence in the list of sentences. + sent_index = sentences.index(ent.sent) + except AttributeError: + # Catch the exception when ent.sent is None and provide a user-friendly warning + raise RuntimeError(Errors.E030) from None + # get n previous sentences, if there are any + start_sentence = max(0, sent_index - n_sents) + # get n posterior sentences, or as many < n as there are + end_sentence = min(len(sentences) - 1, sent_index + n_sents) + # get token positions + start_token = sentences[start_sentence].start + end_token = sentences[end_sentence].end + # save positions for extraction + cands.append((start_token, end_token)) + + candidates.append(ops.asarray2i(cands)) + candlens = ops.asarray1i([len(cands) for cands in candidates]) + candidates = ops.xp.concatenate(candidates) + outputs = Ragged(candidates, candlens) + # because this is just rearranging docs, the backprop does nothing + return outputs, lambda x: [] + + @registry.misc("spacy.KBFromFile.v1") def load_kb(kb_path: Path) -> Callable[[Vocab], KnowledgeBase]: def kb_from_file(vocab): diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index 1169e898d..89e7576bf 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -6,17 +6,17 @@ import srsly import random from thinc.api import CosineDistance, Model, Optimizer, Config from thinc.api import set_dropout_rate -import warnings from ..kb import KnowledgeBase, Candidate from ..ml import empty_kb from ..tokens import Doc, Span from .pipe import deserialize_config +from .legacy.entity_linker import EntityLinker_v1 from .trainable_pipe import TrainablePipe from ..language import Language from ..vocab import Vocab from ..training import Example, validate_examples, validate_get_examples -from ..errors import Errors, Warnings +from ..errors import Errors from ..util import SimpleFrozenList, registry from .. 
import util from ..scorer import Scorer @@ -26,7 +26,7 @@ BACKWARD_OVERWRITE = True default_model_config = """ [model] -@architectures = "spacy.EntityLinker.v1" +@architectures = "spacy.EntityLinker.v2" [model.tok2vec] @architectures = "spacy.HashEmbedCNN.v2" @@ -55,6 +55,7 @@ DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"] "get_candidates": {"@misc": "spacy.CandidateGenerator.v1"}, "overwrite": True, "scorer": {"@scorers": "spacy.entity_linker_scorer.v1"}, + "use_gold_ents": True, }, default_score_weights={ "nel_micro_f": 1.0, @@ -75,6 +76,7 @@ def make_entity_linker( get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]], overwrite: bool, scorer: Optional[Callable], + use_gold_ents: bool, ): """Construct an EntityLinker component. @@ -90,6 +92,22 @@ def make_entity_linker( produces a list of candidates, given a certain knowledge base and a textual mention. scorer (Optional[Callable]): The scoring method. """ + + if not model.attrs.get("include_span_maker", False): + # The only difference in arguments here is that use_gold_ents is not available + return EntityLinker_v1( + nlp.vocab, + model, + name, + labels_discard=labels_discard, + n_sents=n_sents, + incl_prior=incl_prior, + incl_context=incl_context, + entity_vector_length=entity_vector_length, + get_candidates=get_candidates, + overwrite=overwrite, + scorer=scorer, + ) return EntityLinker( nlp.vocab, model, @@ -102,6 +120,7 @@ def make_entity_linker( get_candidates=get_candidates, overwrite=overwrite, scorer=scorer, + use_gold_ents=use_gold_ents, ) @@ -136,6 +155,7 @@ class EntityLinker(TrainablePipe): get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]], overwrite: bool = BACKWARD_OVERWRITE, scorer: Optional[Callable] = entity_linker_score, + use_gold_ents: bool, ) -> None: """Initialize an entity linker. @@ -152,6 +172,8 @@ class EntityLinker(TrainablePipe): produces a list of candidates, given a certain knowledge base and a textual mention. scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_links. + use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another + component must provide entity annotations. DOCS: https://spacy.io/api/entitylinker#init """ @@ -169,6 +191,7 @@ class EntityLinker(TrainablePipe): # create an empty KB by default. If you want to load a predefined one, specify it in 'initialize'. self.kb = empty_kb(entity_vector_length)(self.vocab) self.scorer = scorer + self.use_gold_ents = use_gold_ents def set_kb(self, kb_loader: Callable[[Vocab], KnowledgeBase]): """Define the KB of this pipe by providing a function that will @@ -212,14 +235,48 @@ class EntityLinker(TrainablePipe): doc_sample = [] vector_sample = [] for example in islice(get_examples(), 10): - doc_sample.append(example.x) + doc = example.x + if self.use_gold_ents: + doc.ents = example.y.ents + doc_sample.append(doc) vector_sample.append(self.model.ops.alloc1f(nO)) assert len(doc_sample) > 0, Errors.E923.format(name=self.name) assert len(vector_sample) > 0, Errors.E923.format(name=self.name) + + # XXX In order for size estimation to work, there has to be at least + # one entity. It's not used for training so it doesn't have to be real, + # so we add a fake one if none are present. + # We can't use Doc.has_annotation here because it can be True for docs + # that have been through an NER component but got no entities. 
+ has_annotations = any([doc.ents for doc in doc_sample]) + if not has_annotations: + doc = doc_sample[0] + ent = doc[0:1] + ent.label_ = "XXX" + doc.ents = (ent,) + self.model.initialize( X=doc_sample, Y=self.model.ops.asarray(vector_sample, dtype="float32") ) + if not has_annotations: + # Clean up dummy annotation + doc.ents = [] + + def batch_has_learnable_example(self, examples): + """Check if a batch contains a learnable example. + + If one isn't present, then the update step needs to be skipped. + """ + + for eg in examples: + for ent in eg.predicted.ents: + candidates = list(self.get_candidates(self.kb, ent)) + if candidates: + return True + + return False + def update( self, examples: Iterable[Example], @@ -247,35 +304,29 @@ class EntityLinker(TrainablePipe): if not examples: return losses validate_examples(examples, "EntityLinker.update") - sentence_docs = [] - for eg in examples: - sentences = [s for s in eg.reference.sents] - kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True) - for ent in eg.reference.ents: - # KB ID of the first token is the same as the whole span - kb_id = kb_ids[ent.start] - if kb_id: - try: - # find the sentence in the list of sentences. - sent_index = sentences.index(ent.sent) - except AttributeError: - # Catch the exception when ent.sent is None and provide a user-friendly warning - raise RuntimeError(Errors.E030) from None - # get n previous sentences, if there are any - start_sentence = max(0, sent_index - self.n_sents) - # get n posterior sentences, or as many < n as there are - end_sentence = min(len(sentences) - 1, sent_index + self.n_sents) - # get token positions - start_token = sentences[start_sentence].start - end_token = sentences[end_sentence].end - # append that span as a doc to training - sent_doc = eg.predicted[start_token:end_token].as_doc() - sentence_docs.append(sent_doc) + set_dropout_rate(self.model, drop) - if not sentence_docs: - warnings.warn(Warnings.W093.format(name="Entity Linker")) + docs = [eg.predicted for eg in examples] + # save to restore later + old_ents = [doc.ents for doc in docs] + + for doc, ex in zip(docs, examples): + if self.use_gold_ents: + doc.ents = ex.reference.ents + else: + # only keep matching ents + doc.ents = ex.get_matching_ents() + + # make sure we have something to learn from, if not, short-circuit + if not self.batch_has_learnable_example(examples): return losses - sentence_encodings, bp_context = self.model.begin_update(sentence_docs) + + sentence_encodings, bp_context = self.model.begin_update(docs) + + # now restore the ents + for doc, old in zip(docs, old_ents): + doc.ents = old + loss, d_scores = self.get_loss( sentence_encodings=sentence_encodings, examples=examples ) @@ -288,24 +339,38 @@ class EntityLinker(TrainablePipe): def get_loss(self, examples: Iterable[Example], sentence_encodings: Floats2d): validate_examples(examples, "EntityLinker.get_loss") entity_encodings = [] + eidx = 0 # indices in gold entities to keep + keep_ents = [] # indices in sentence_encodings to keep + for eg in examples: kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True) + for ent in eg.reference.ents: kb_id = kb_ids[ent.start] if kb_id: entity_encoding = self.kb.get_vector(kb_id) entity_encodings.append(entity_encoding) + keep_ents.append(eidx) + + eidx += 1 entity_encodings = self.model.ops.asarray(entity_encodings, dtype="float32") - if sentence_encodings.shape != entity_encodings.shape: + selected_encodings = sentence_encodings[keep_ents] + + # If the entity encodings list is empty, then + if 
selected_encodings.shape != entity_encodings.shape: err = Errors.E147.format( method="get_loss", msg="gold entities do not match up" ) raise RuntimeError(err) # TODO: fix typing issue here - gradients = self.distance.get_grad(sentence_encodings, entity_encodings) # type: ignore - loss = self.distance.get_loss(sentence_encodings, entity_encodings) # type: ignore + gradients = self.distance.get_grad(selected_encodings, entity_encodings) # type: ignore + # to match the input size, we need to give a zero gradient for items not in the kb + out = self.model.ops.alloc2f(*sentence_encodings.shape) + out[keep_ents] = gradients + + loss = self.distance.get_loss(selected_encodings, entity_encodings) # type: ignore loss = loss / len(entity_encodings) - return float(loss), gradients + return float(loss), out def predict(self, docs: Iterable[Doc]) -> List[str]: """Apply the pipeline's model to a batch of docs, without modifying them. diff --git a/spacy/pipeline/legacy/__init__.py b/spacy/pipeline/legacy/__init__.py new file mode 100644 index 000000000..f216840dc --- /dev/null +++ b/spacy/pipeline/legacy/__init__.py @@ -0,0 +1,3 @@ +from .entity_linker import EntityLinker_v1 + +__all__ = ["EntityLinker_v1"] diff --git a/spacy/pipeline/legacy/entity_linker.py b/spacy/pipeline/legacy/entity_linker.py new file mode 100644 index 000000000..6440c18e5 --- /dev/null +++ b/spacy/pipeline/legacy/entity_linker.py @@ -0,0 +1,427 @@ +# This file is present to provide a prior version of the EntityLinker component +# for backwards compatability. For details see #9669. + +from typing import Optional, Iterable, Callable, Dict, Union, List, Any +from thinc.types import Floats2d +from pathlib import Path +from itertools import islice +import srsly +import random +from thinc.api import CosineDistance, Model, Optimizer, Config +from thinc.api import set_dropout_rate +import warnings + +from ...kb import KnowledgeBase, Candidate +from ...ml import empty_kb +from ...tokens import Doc, Span +from ..pipe import deserialize_config +from ..trainable_pipe import TrainablePipe +from ...language import Language +from ...vocab import Vocab +from ...training import Example, validate_examples, validate_get_examples +from ...errors import Errors, Warnings +from ...util import SimpleFrozenList, registry +from ... import util +from ...scorer import Scorer + +# See #9050 +BACKWARD_OVERWRITE = True + + +def entity_linker_score(examples, **kwargs): + return Scorer.score_links(examples, negative_labels=[EntityLinker_v1.NIL], **kwargs) + + +class EntityLinker_v1(TrainablePipe): + """Pipeline component for named entity linking. + + DOCS: https://spacy.io/api/entitylinker + """ + + NIL = "NIL" # string used to refer to a non-existing link + + def __init__( + self, + vocab: Vocab, + model: Model, + name: str = "entity_linker", + *, + labels_discard: Iterable[str], + n_sents: int, + incl_prior: bool, + incl_context: bool, + entity_vector_length: int, + get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]], + overwrite: bool = BACKWARD_OVERWRITE, + scorer: Optional[Callable] = entity_linker_score, + ) -> None: + """Initialize an entity linker. + + vocab (Vocab): The shared vocabulary. + model (thinc.api.Model): The Thinc Model powering the pipeline component. + name (str): The component instance name, used to add entries to the + losses during training. + labels_discard (Iterable[str]): NER labels that will automatically get a "NIL" prediction. + n_sents (int): The number of neighbouring sentences to take into account. 
+ incl_prior (bool): Whether or not to include prior probabilities from the KB in the model. + incl_context (bool): Whether or not to include the local context in the model. + entity_vector_length (int): Size of encoding vectors in the KB. + get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that + produces a list of candidates, given a certain knowledge base and a textual mention. + scorer (Optional[Callable]): The scoring method. Defaults to + Scorer.score_links. + + DOCS: https://spacy.io/api/entitylinker#init + """ + self.vocab = vocab + self.model = model + self.name = name + self.labels_discard = list(labels_discard) + self.n_sents = n_sents + self.incl_prior = incl_prior + self.incl_context = incl_context + self.get_candidates = get_candidates + self.cfg: Dict[str, Any] = {"overwrite": overwrite} + self.distance = CosineDistance(normalize=False) + # how many neighbour sentences to take into account + # create an empty KB by default. If you want to load a predefined one, specify it in 'initialize'. + self.kb = empty_kb(entity_vector_length)(self.vocab) + self.scorer = scorer + + def set_kb(self, kb_loader: Callable[[Vocab], KnowledgeBase]): + """Define the KB of this pipe by providing a function that will + create it using this object's vocab.""" + if not callable(kb_loader): + raise ValueError(Errors.E885.format(arg_type=type(kb_loader))) + + self.kb = kb_loader(self.vocab) + + def validate_kb(self) -> None: + # Raise an error if the knowledge base is not initialized. + if self.kb is None: + raise ValueError(Errors.E1018.format(name=self.name)) + if len(self.kb) == 0: + raise ValueError(Errors.E139.format(name=self.name)) + + def initialize( + self, + get_examples: Callable[[], Iterable[Example]], + *, + nlp: Optional[Language] = None, + kb_loader: Optional[Callable[[Vocab], KnowledgeBase]] = None, + ): + """Initialize the pipe for training, using a representative set + of data examples. + + get_examples (Callable[[], Iterable[Example]]): Function that + returns a representative sample of gold-standard Example objects. + nlp (Language): The current nlp object the component is part of. + kb_loader (Callable[[Vocab], KnowledgeBase]): A function that creates a KnowledgeBase from a Vocab instance. + Note that providing this argument, will overwrite all data accumulated in the current KB. + Use this only when loading a KB as-such from file. + + DOCS: https://spacy.io/api/entitylinker#initialize + """ + validate_get_examples(get_examples, "EntityLinker_v1.initialize") + if kb_loader is not None: + self.set_kb(kb_loader) + self.validate_kb() + nO = self.kb.entity_vector_length + doc_sample = [] + vector_sample = [] + for example in islice(get_examples(), 10): + doc_sample.append(example.x) + vector_sample.append(self.model.ops.alloc1f(nO)) + assert len(doc_sample) > 0, Errors.E923.format(name=self.name) + assert len(vector_sample) > 0, Errors.E923.format(name=self.name) + self.model.initialize( + X=doc_sample, Y=self.model.ops.asarray(vector_sample, dtype="float32") + ) + + def update( + self, + examples: Iterable[Example], + *, + drop: float = 0.0, + sgd: Optional[Optimizer] = None, + losses: Optional[Dict[str, float]] = None, + ) -> Dict[str, float]: + """Learn from a batch of documents and gold-standard information, + updating the pipe's model. Delegates to predict and get_loss. + + examples (Iterable[Example]): A batch of Example objects. + drop (float): The dropout rate. + sgd (thinc.api.Optimizer): The optimizer. 
+ losses (Dict[str, float]): Optional record of the loss during training. + Updated using the component name as the key. + RETURNS (Dict[str, float]): The updated losses dictionary. + + DOCS: https://spacy.io/api/entitylinker#update + """ + self.validate_kb() + if losses is None: + losses = {} + losses.setdefault(self.name, 0.0) + if not examples: + return losses + validate_examples(examples, "EntityLinker_v1.update") + sentence_docs = [] + for eg in examples: + sentences = [s for s in eg.reference.sents] + kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True) + for ent in eg.reference.ents: + # KB ID of the first token is the same as the whole span + kb_id = kb_ids[ent.start] + if kb_id: + try: + # find the sentence in the list of sentences. + sent_index = sentences.index(ent.sent) + except AttributeError: + # Catch the exception when ent.sent is None and provide a user-friendly warning + raise RuntimeError(Errors.E030) from None + # get n previous sentences, if there are any + start_sentence = max(0, sent_index - self.n_sents) + # get n posterior sentences, or as many < n as there are + end_sentence = min(len(sentences) - 1, sent_index + self.n_sents) + # get token positions + start_token = sentences[start_sentence].start + end_token = sentences[end_sentence].end + # append that span as a doc to training + sent_doc = eg.predicted[start_token:end_token].as_doc() + sentence_docs.append(sent_doc) + set_dropout_rate(self.model, drop) + if not sentence_docs: + warnings.warn(Warnings.W093.format(name="Entity Linker")) + return losses + sentence_encodings, bp_context = self.model.begin_update(sentence_docs) + loss, d_scores = self.get_loss( + sentence_encodings=sentence_encodings, examples=examples + ) + bp_context(d_scores) + if sgd is not None: + self.finish_update(sgd) + losses[self.name] += loss + return losses + + def get_loss(self, examples: Iterable[Example], sentence_encodings: Floats2d): + validate_examples(examples, "EntityLinker_v1.get_loss") + entity_encodings = [] + for eg in examples: + kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True) + for ent in eg.reference.ents: + kb_id = kb_ids[ent.start] + if kb_id: + entity_encoding = self.kb.get_vector(kb_id) + entity_encodings.append(entity_encoding) + entity_encodings = self.model.ops.asarray(entity_encodings, dtype="float32") + if sentence_encodings.shape != entity_encodings.shape: + err = Errors.E147.format( + method="get_loss", msg="gold entities do not match up" + ) + raise RuntimeError(err) + # TODO: fix typing issue here + gradients = self.distance.get_grad(sentence_encodings, entity_encodings) # type: ignore + loss = self.distance.get_loss(sentence_encodings, entity_encodings) # type: ignore + loss = loss / len(entity_encodings) + return float(loss), gradients + + def predict(self, docs: Iterable[Doc]) -> List[str]: + """Apply the pipeline's model to a batch of docs, without modifying them. + Returns the KB IDs for each entity in each doc, including NIL if there is + no prediction. + + docs (Iterable[Doc]): The documents to predict. + RETURNS (List[str]): The models prediction for each document. 
+ + DOCS: https://spacy.io/api/entitylinker#predict + """ + self.validate_kb() + entity_count = 0 + final_kb_ids: List[str] = [] + if not docs: + return final_kb_ids + if isinstance(docs, Doc): + docs = [docs] + for i, doc in enumerate(docs): + sentences = [s for s in doc.sents] + if len(doc) > 0: + # Looping through each entity (TODO: rewrite) + for ent in doc.ents: + sent = ent.sent + sent_index = sentences.index(sent) + assert sent_index >= 0 + # get n_neighbour sentences, clipped to the length of the document + start_sentence = max(0, sent_index - self.n_sents) + end_sentence = min(len(sentences) - 1, sent_index + self.n_sents) + start_token = sentences[start_sentence].start + end_token = sentences[end_sentence].end + sent_doc = doc[start_token:end_token].as_doc() + # currently, the context is the same for each entity in a sentence (should be refined) + xp = self.model.ops.xp + if self.incl_context: + sentence_encoding = self.model.predict([sent_doc])[0] + sentence_encoding_t = sentence_encoding.T + sentence_norm = xp.linalg.norm(sentence_encoding_t) + entity_count += 1 + if ent.label_ in self.labels_discard: + # ignoring this entity - setting to NIL + final_kb_ids.append(self.NIL) + else: + candidates = list(self.get_candidates(self.kb, ent)) + if not candidates: + # no prediction possible for this entity - setting to NIL + final_kb_ids.append(self.NIL) + elif len(candidates) == 1: + # shortcut for efficiency reasons: take the 1 candidate + # TODO: thresholding + final_kb_ids.append(candidates[0].entity_) + else: + random.shuffle(candidates) + # set all prior probabilities to 0 if incl_prior=False + prior_probs = xp.asarray([c.prior_prob for c in candidates]) + if not self.incl_prior: + prior_probs = xp.asarray([0.0 for _ in candidates]) + scores = prior_probs + # add in similarity from the context + if self.incl_context: + entity_encodings = xp.asarray( + [c.entity_vector for c in candidates] + ) + entity_norm = xp.linalg.norm(entity_encodings, axis=1) + if len(entity_encodings) != len(prior_probs): + raise RuntimeError( + Errors.E147.format( + method="predict", + msg="vectors not of equal length", + ) + ) + # cosine similarity + sims = xp.dot(entity_encodings, sentence_encoding_t) / ( + sentence_norm * entity_norm + ) + if sims.shape != prior_probs.shape: + raise ValueError(Errors.E161) + scores = prior_probs + sims - (prior_probs * sims) + # TODO: thresholding + best_index = scores.argmax().item() + best_candidate = candidates[best_index] + final_kb_ids.append(best_candidate.entity_) + if not (len(final_kb_ids) == entity_count): + err = Errors.E147.format( + method="predict", msg="result variables not of equal length" + ) + raise RuntimeError(err) + return final_kb_ids + + def set_annotations(self, docs: Iterable[Doc], kb_ids: List[str]) -> None: + """Modify a batch of documents, using pre-computed scores. + + docs (Iterable[Doc]): The documents to modify. + kb_ids (List[str]): The IDs to set, produced by EntityLinker.predict. + + DOCS: https://spacy.io/api/entitylinker#set_annotations + """ + count_ents = len([ent for doc in docs for ent in doc.ents]) + if count_ents != len(kb_ids): + raise ValueError(Errors.E148.format(ents=count_ents, ids=len(kb_ids))) + i = 0 + overwrite = self.cfg["overwrite"] + for doc in docs: + for ent in doc.ents: + kb_id = kb_ids[i] + i += 1 + for token in ent: + if token.ent_kb_id == 0 or overwrite: + token.ent_kb_id_ = kb_id + + def to_bytes(self, *, exclude=tuple()): + """Serialize the pipe to a bytestring. 
+ + exclude (Iterable[str]): String names of serialization fields to exclude. + RETURNS (bytes): The serialized object. + + DOCS: https://spacy.io/api/entitylinker#to_bytes + """ + self._validate_serialization_attrs() + serialize = {} + if hasattr(self, "cfg") and self.cfg is not None: + serialize["cfg"] = lambda: srsly.json_dumps(self.cfg) + serialize["vocab"] = lambda: self.vocab.to_bytes(exclude=exclude) + serialize["kb"] = self.kb.to_bytes + serialize["model"] = self.model.to_bytes + return util.to_bytes(serialize, exclude) + + def from_bytes(self, bytes_data, *, exclude=tuple()): + """Load the pipe from a bytestring. + + exclude (Iterable[str]): String names of serialization fields to exclude. + RETURNS (TrainablePipe): The loaded object. + + DOCS: https://spacy.io/api/entitylinker#from_bytes + """ + self._validate_serialization_attrs() + + def load_model(b): + try: + self.model.from_bytes(b) + except AttributeError: + raise ValueError(Errors.E149) from None + + deserialize = {} + if hasattr(self, "cfg") and self.cfg is not None: + deserialize["cfg"] = lambda b: self.cfg.update(srsly.json_loads(b)) + deserialize["vocab"] = lambda b: self.vocab.from_bytes(b, exclude=exclude) + deserialize["kb"] = lambda b: self.kb.from_bytes(b) + deserialize["model"] = load_model + util.from_bytes(bytes_data, deserialize, exclude) + return self + + def to_disk( + self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList() + ) -> None: + """Serialize the pipe to disk. + + path (str / Path): Path to a directory. + exclude (Iterable[str]): String names of serialization fields to exclude. + + DOCS: https://spacy.io/api/entitylinker#to_disk + """ + serialize = {} + serialize["vocab"] = lambda p: self.vocab.to_disk(p, exclude=exclude) + serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg) + serialize["kb"] = lambda p: self.kb.to_disk(p) + serialize["model"] = lambda p: self.model.to_disk(p) + util.to_disk(path, serialize, exclude) + + def from_disk( + self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList() + ) -> "EntityLinker_v1": + """Load the pipe from disk. Modifies the object in place and returns it. + + path (str / Path): Path to a directory. + exclude (Iterable[str]): String names of serialization fields to exclude. + RETURNS (EntityLinker): The modified EntityLinker object. 
+ + DOCS: https://spacy.io/api/entitylinker#from_disk + """ + + def load_model(p): + try: + with p.open("rb") as infile: + self.model.from_bytes(infile.read()) + except AttributeError: + raise ValueError(Errors.E149) from None + + deserialize: Dict[str, Callable[[Any], Any]] = {} + deserialize["cfg"] = lambda p: self.cfg.update(deserialize_config(p)) + deserialize["vocab"] = lambda p: self.vocab.from_disk(p, exclude=exclude) + deserialize["kb"] = lambda p: self.kb.from_disk(p) + deserialize["model"] = load_model + util.from_disk(path, deserialize, exclude) + return self + + def rehearse(self, examples, *, sgd=None, losses=None, **config): + raise NotImplementedError + + def add_label(self, label): + raise NotImplementedError diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index 3740e430e..7d1382741 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -9,6 +9,9 @@ from spacy.compat import pickle from spacy.kb import Candidate, KnowledgeBase, get_candidates from spacy.lang.en import English from spacy.ml import load_kb +from spacy.pipeline import EntityLinker +from spacy.pipeline.legacy import EntityLinker_v1 +from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL from spacy.scorer import Scorer from spacy.tests.util import make_tempdir from spacy.tokens import Span @@ -168,6 +171,45 @@ def test_issue7065_b(): assert doc +def test_no_entities(): + # Test that having no entities doesn't crash the model + TRAIN_DATA = [ + ( + "The sky is blue.", + { + "sent_starts": [1, 0, 0, 0, 0], + }, + ) + ] + nlp = English() + vector_length = 3 + train_examples = [] + for text, annotation in TRAIN_DATA: + doc = nlp(text) + train_examples.append(Example.from_dict(doc, annotation)) + + def create_kb(vocab): + # create artificial KB + mykb = KnowledgeBase(vocab, entity_vector_length=vector_length) + mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3]) + mykb.add_alias("Russ Cochran", ["Q2146908"], [0.9]) + return mykb + + # Create and train the Entity Linker + entity_linker = nlp.add_pipe("entity_linker", last=True) + entity_linker.set_kb(create_kb) + optimizer = nlp.initialize(get_examples=lambda: train_examples) + for i in range(2): + losses = {} + nlp.update(train_examples, sgd=optimizer, losses=losses) + + # adding additional components that are required for the entity_linker + nlp.add_pipe("sentencizer", first=True) + + # this will run the pipeline on the examples and shouldn't crash + results = nlp.evaluate(train_examples) + + def test_partial_links(): # Test that having some entities on the doc without gold links, doesn't crash TRAIN_DATA = [ @@ -650,7 +692,7 @@ TRAIN_DATA = [ "sent_starts": [1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}), ("Russ Cochran his reprints include EC Comics.", {"links": {(0, 12): {"Q7381115": 1.0, "Q2146908": 0.0}}, - "entities": [(0, 12, "PERSON")], + "entities": [(0, 12, "PERSON"), (34, 43, "ART")], "sent_starts": [1, -1, 0, 0, 0, 0, 0, 0]}), ("Russ Cochran has been publishing comic art.", {"links": {(0, 12): {"Q7381115": 1.0, "Q2146908": 0.0}}, @@ -693,6 +735,7 @@ def test_overfitting_IO(): # Create the Entity Linker component and add it to the pipeline entity_linker = nlp.add_pipe("entity_linker", last=True) + assert isinstance(entity_linker, EntityLinker) entity_linker.set_kb(create_kb) assert "Q2146908" in entity_linker.vocab.strings assert "Q2146908" in entity_linker.kb.vocab.strings @@ -922,3 +965,109 @@ def test_scorer_links(): assert 
scores["nel_micro_p"] == 2 / 3 assert scores["nel_micro_r"] == 2 / 4 + + +# fmt: off +@pytest.mark.parametrize( + "name,config", + [ + ("entity_linker", {"@architectures": "spacy.EntityLinker.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL}), + ("entity_linker", {"@architectures": "spacy.EntityLinker.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL}), + ], +) +# fmt: on +def test_legacy_architectures(name, config): + # Ensure that the legacy architectures still work + vector_length = 3 + nlp = English() + + train_examples = [] + for text, annotation in TRAIN_DATA: + doc = nlp.make_doc(text) + train_examples.append(Example.from_dict(doc, annotation)) + + def create_kb(vocab): + mykb = KnowledgeBase(vocab, entity_vector_length=vector_length) + mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3]) + mykb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7]) + mykb.add_alias( + alias="Russ Cochran", + entities=["Q2146908", "Q7381115"], + probabilities=[0.5, 0.5], + ) + return mykb + + entity_linker = nlp.add_pipe(name, config={"model": config}) + if config["@architectures"] == "spacy.EntityLinker.v1": + assert isinstance(entity_linker, EntityLinker_v1) + else: + assert isinstance(entity_linker, EntityLinker) + entity_linker.set_kb(create_kb) + optimizer = nlp.initialize(get_examples=lambda: train_examples) + + for i in range(2): + losses = {} + nlp.update(train_examples, sgd=optimizer, losses=losses) + + @pytest.mark.parametrize("patterns", [ + # perfect case + [{"label": "CHARACTER", "pattern": "Kirby"}], + # typo for false negative + [{"label": "PERSON", "pattern": "Korby"}], + # random stuff for false positive + [{"label": "IS", "pattern": "is"}, {"label": "COLOR", "pattern": "pink"}], + ] + ) + def test_no_gold_ents(patterns): + # test that annotating components work + TRAIN_DATA = [ + ( + "Kirby is pink", + { + "links": {(0, 5): {"Q613241": 1.0}}, + "entities": [(0, 5, "CHARACTER")], + "sent_starts": [1, 0, 0], + }, + ) + ] + nlp = English() + vector_length = 3 + train_examples = [] + for text, annotation in TRAIN_DATA: + doc = nlp(text) + train_examples.append(Example.from_dict(doc, annotation)) + + # Create a ruler to mark entities + ruler = nlp.add_pipe("entity_ruler") + ruler.add_patterns(patterns) + + # Apply ruler to examples. In a real pipeline this would be an annotating component. 
+ for eg in train_examples: + eg.predicted = ruler(eg.predicted) + + def create_kb(vocab): + # create artificial KB + mykb = KnowledgeBase(vocab, entity_vector_length=vector_length) + mykb.add_entity(entity="Q613241", freq=12, entity_vector=[6, -4, 3]) + mykb.add_alias("Kirby", ["Q613241"], [0.9]) + # Placeholder + mykb.add_entity(entity="pink", freq=12, entity_vector=[7, 2, -5]) + mykb.add_alias("pink", ["pink"], [0.9]) + return mykb + + + # Create and train the Entity Linker + entity_linker = nlp.add_pipe("entity_linker", config={"use_gold_ents": False}, last=True) + entity_linker.set_kb(create_kb) + assert entity_linker.use_gold_ents == False + + optimizer = nlp.initialize(get_examples=lambda: train_examples) + for i in range(2): + losses = {} + nlp.update(train_examples, sgd=optimizer, losses=losses) + + # adding additional components that are required for the entity_linker + nlp.add_pipe("sentencizer", first=True) + + # this will run the pipeline on the examples and shouldn't crash + results = nlp.evaluate(train_examples) diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx index d792c9bbf..a2c5e08e9 100644 --- a/spacy/training/example.pyx +++ b/spacy/training/example.pyx @@ -256,6 +256,29 @@ cdef class Example: x_ents, x_tags = self.get_aligned_ents_and_ner() return x_tags + def get_matching_ents(self, check_label=True): + """Return entities that are shared between predicted and reference docs. + + If `check_label` is True, entities must have matching labels to be + kept. Otherwise only the character indices need to match. + """ + gold = {} + for ent in self.reference: + gold[(ent.start_char, ent.end_char)] = ent.label + + keep = [] + for ent in self.predicted: + key = (ent.start_char, ent.end_char) + if key not in gold: + continue + + if check_label and ent.label != gold[key]: + continue + + keep.append(ent) + + return keep + def to_dict(self): return { "doc_annotation": { diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md index 07b76393f..5fb3546a7 100644 --- a/website/docs/api/architectures.md +++ b/website/docs/api/architectures.md @@ -858,13 +858,13 @@ into the "real world". This requires 3 main components: - A machine learning [`Model`](https://thinc.ai/docs/api-model) that picks the most plausible ID from the set of candidates. -### spacy.EntityLinker.v1 {#EntityLinker} +### spacy.EntityLinker.v2 {#EntityLinker} > #### Example Config > > ```ini > [model] -> @architectures = "spacy.EntityLinker.v1" +> @architectures = "spacy.EntityLinker.v2" > nO = null > > [model.tok2vec] diff --git a/website/docs/api/entitylinker.md b/website/docs/api/entitylinker.md index 3d3372679..8e0d6087a 100644 --- a/website/docs/api/entitylinker.md +++ b/website/docs/api/entitylinker.md @@ -59,6 +59,7 @@ architectures and their arguments and hyperparameters. | `incl_context` | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~ | | `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~ | | `entity_vector_length` | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~ | +| `use_gold_ents` | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~int~~ | | `get_candidates` | Function that generates plausible candidates for a given `Span` object. 
Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ | | `overwrite` 3.2 | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ | | `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ | From d89dac4066b3a245adb3982709bb7bb6eb9b9d63 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Fri, 4 Mar 2022 11:07:45 +0100 Subject: [PATCH 046/123] hook up meta in load_model_from_config (#10400) --- spacy/util.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/spacy/util.py b/spacy/util.py index 2a8b9f5cc..66e257dd8 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -485,13 +485,16 @@ def load_model_from_path( config_path = model_path / "config.cfg" overrides = dict_to_dot(config) config = load_config(config_path, overrides=overrides) - nlp = load_model_from_config(config, vocab=vocab, disable=disable, exclude=exclude) + nlp = load_model_from_config( + config, vocab=vocab, disable=disable, exclude=exclude, meta=meta + ) return nlp.from_disk(model_path, exclude=exclude, overrides=overrides) def load_model_from_config( config: Union[Dict[str, Any], Config], *, + meta: Dict[str, Any] = SimpleFrozenDict(), vocab: Union["Vocab", bool] = True, disable: Iterable[str] = SimpleFrozenList(), exclude: Iterable[str] = SimpleFrozenList(), @@ -529,6 +532,7 @@ def load_model_from_config( exclude=exclude, auto_fill=auto_fill, validate=validate, + meta=meta, ) return nlp From 6f4f57f3172112eb34336b0d6c0f0a0c930a5d1c Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Mon, 7 Mar 2022 18:41:03 +0900 Subject: [PATCH 047/123] Update Issue Templates (#10446) * Remove mention of python 3.10 wheels These were released a while ago, just forgot to remove this notice. * Add note about Discussions --- .github/ISSUE_TEMPLATE/01_bugs.md | 2 ++ .github/ISSUE_TEMPLATE/config.yml | 3 --- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/01_bugs.md b/.github/ISSUE_TEMPLATE/01_bugs.md index 768832c24..255a5241e 100644 --- a/.github/ISSUE_TEMPLATE/01_bugs.md +++ b/.github/ISSUE_TEMPLATE/01_bugs.md @@ -4,6 +4,8 @@ about: Use this template if you came across a bug or unexpected behaviour differ --- + + ## How to reproduce the behaviour diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml index fce1a1064..31f89f917 100644 --- a/.github/ISSUE_TEMPLATE/config.yml +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -1,8 +1,5 @@ blank_issues_enabled: false contact_links: - - name: ⚠️ Python 3.10 Support - url: https://github.com/explosion/spaCy/discussions/9418 - about: Python 3.10 wheels haven't been released yet, see the link for details. - name: 🗯 Discussions Forum url: https://github.com/explosion/spaCy/discussions about: Install issues, usage questions, general discussion and anything else that isn't a bug report. 
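As a usage note for the `load_model_from_config` change in [PATCH 046/123] above: the forwarded `meta` dict ends up on the loaded pipeline. A minimal, self-contained sketch (the name and version values below are placeholders, not taken from the patch):

```python
from spacy.lang.en import English
from spacy.util import load_model_from_config

# Build a complete config from a blank English pipeline, then reload it while
# passing an explicit meta dict, as load_model_from_path now does with the
# contents of a package's meta.json.
config = English().config
meta = {"name": "demo_pipeline", "version": "0.0.1"}  # placeholder values

nlp = load_model_from_config(config, meta=meta, auto_fill=True)
print(nlp.meta["name"], nlp.meta["version"])  # demo_pipeline 0.0.1
```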
From a6d5824e5f8361078f4075541e7fd41b304cf379 Mon Sep 17 00:00:00 2001 From: David Berenstein Date: Mon, 7 Mar 2022 12:47:26 +0100 Subject: [PATCH 048/123] added classy-classification package to spacy universe (#10393) * Update universe.json added classy-classification to Spacy universe * Update universe.json added classy-classification to the spacy universe resources * Update universe.json corrected a small typo in json * Update website/meta/universe.json Co-authored-by: Sofie Van Landeghem * Update website/meta/universe.json Co-authored-by: Sofie Van Landeghem * Update website/meta/universe.json Co-authored-by: Sofie Van Landeghem * Update universe.json processed merge feedback * Update universe.json Co-authored-by: Sofie Van Landeghem --- website/meta/universe.json | 47 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/website/meta/universe.json b/website/meta/universe.json index 6374600f2..0179830d0 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -2599,6 +2599,53 @@ }, "category": ["pipeline"] }, + { + "id": "classyclassification", + "slogan": "A Python library for classy few-shot and zero-shot classification within spaCy.", + "description": "Huggingface does offer some nice models for few/zero-shot classification, but these are not tailored to multi-lingual approaches. Rasa NLU has a nice approach for this, but its too embedded in their codebase for easy usage outside of Rasa/chatbots. Additionally, it made sense to integrate sentence-transformers and Huggingface zero-shot, instead of default word embeddings. Finally, I decided to integrate with spaCy, since training a custom spaCy TextCategorizer seems like a lot of hassle if you want something quick and dirty.", + "github": "davidberenstein1957/classy-classification", + "pip": "classy-classification", + "code_example": [ + "import spacy", + "import classy_classification", + "", + "data = {", + " \"furniture\": [\"This text is about chairs.\",", + " \"Couches, benches and televisions.\",", + " \"I really need to get a new sofa.\"],", + " \"kitchen\": [\"There also exist things like fridges.\",", + " \"I hope to be getting a new stove today.\",", + " \"Do you also have some ovens.\"]", + "}", + "", + "nlp = spacy.load('en_core_web_md')", + "", + "classification_type = \"spacy_few_shot\"", + "if classification_type == \"spacy_few_shot\":", + " nlp.add_pipe(\"text_categorizer\", ", + " config={\"data\": data, \"model\": \"spacy\"}", + " )", + "elif classification_type == \"sentence_transformer_few_shot\":", + " nlp.add_pipe(\"text_categorizer\", ", + " config={\"data\": data, \"model\": \"sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2\"}", + " )", + "elif classification_type == \"huggingface_zero_shot\":", + " nlp.add_pipe(\"text_categorizer\", ", + " config={\"data\": list(data.keys()), \"cat_type\": \"zero\", \"model\": \"facebook/bart-large-mnli\"}", + " )", + "", + "print(nlp(\"I am looking for kitchen appliances.\")._.cats)", + "print([doc._.cats for doc in nlp.pipe([\"I am looking for kitchen appliances.\"])])" + ], + "author": "David Berenstein", + "author_links": { + "github": "davidberenstein1957", + "website": "https://www.linkedin.com/in/david-berenstein-1bab11105/" + }, + "category": ["pipeline", "standalone"], + "tags": ["classification", "zero-shot", "few-shot", "sentence-transformers", "huggingface"], + "spacy_version": 3 + }, { "id": "blackstone", "title": "Blackstone", From 7ed7908716094ff41e4d1b2f60479f6b8356d700 Mon Sep 17 00:00:00 2001 From: 
jnphilipp Date: Mon, 7 Mar 2022 16:20:39 +0100 Subject: [PATCH 049/123] Add Upper Sorbian support. (#10432) * Add support basic support for upper sorbian. * Add tokenizer exceptions and tests. * Update spacy/lang/hsb/examples.py Co-authored-by: Sofie Van Landeghem --- spacy/lang/hsb/__init__.py | 18 ++++++ spacy/lang/hsb/examples.py | 15 +++++ spacy/lang/hsb/lex_attrs.py | 77 ++++++++++++++++++++++++++ spacy/lang/hsb/stop_words.py | 19 +++++++ spacy/lang/hsb/tokenizer_exceptions.py | 18 ++++++ spacy/tests/conftest.py | 5 ++ spacy/tests/lang/hsb/__init__.py | 0 spacy/tests/lang/hsb/test_text.py | 25 +++++++++ spacy/tests/lang/hsb/test_tokenizer.py | 32 +++++++++++ 9 files changed, 209 insertions(+) create mode 100644 spacy/lang/hsb/__init__.py create mode 100644 spacy/lang/hsb/examples.py create mode 100644 spacy/lang/hsb/lex_attrs.py create mode 100644 spacy/lang/hsb/stop_words.py create mode 100644 spacy/lang/hsb/tokenizer_exceptions.py create mode 100644 spacy/tests/lang/hsb/__init__.py create mode 100644 spacy/tests/lang/hsb/test_text.py create mode 100644 spacy/tests/lang/hsb/test_tokenizer.py diff --git a/spacy/lang/hsb/__init__.py b/spacy/lang/hsb/__init__.py new file mode 100644 index 000000000..034d82319 --- /dev/null +++ b/spacy/lang/hsb/__init__.py @@ -0,0 +1,18 @@ +from .lex_attrs import LEX_ATTRS +from .stop_words import STOP_WORDS +from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS +from ...language import Language, BaseDefaults + + +class UpperSorbianDefaults(BaseDefaults): + lex_attr_getters = LEX_ATTRS + stop_words = STOP_WORDS + tokenizer_exceptions = TOKENIZER_EXCEPTIONS + + +class UpperSorbian(Language): + lang = "hsb" + Defaults = UpperSorbianDefaults + + +__all__ = ["UpperSorbian"] diff --git a/spacy/lang/hsb/examples.py b/spacy/lang/hsb/examples.py new file mode 100644 index 000000000..0aafd5cee --- /dev/null +++ b/spacy/lang/hsb/examples.py @@ -0,0 +1,15 @@ +""" +Example sentences to test spaCy and its language models. + +>>> from spacy.lang.hsb.examples import sentences +>>> docs = nlp.pipe(sentences) +""" + + +sentences = [ + "To běšo wjelgin raźone a jo se wót luźi derje pśiwzeło. Tak som dožywiła wjelgin", + "Jogo pśewóźowarce stej groniłej, až how w serbskich stronach njama Santa Claus nic pytaś.", + "A ten sobuźěłaśeŕ Statneje biblioteki w Barlinju jo pśimjeł drogotne knigły bźez rukajcowu z nagima rukoma!", + "Take wobchadanje z našym kulturnym derbstwom zewšym njejźo.", + "Wopśimjeśe drugich pśinoskow jo było na wusokem niwowje, ako pśecej." 
+] diff --git a/spacy/lang/hsb/lex_attrs.py b/spacy/lang/hsb/lex_attrs.py new file mode 100644 index 000000000..dfda3e2db --- /dev/null +++ b/spacy/lang/hsb/lex_attrs.py @@ -0,0 +1,77 @@ +from ...attrs import LIKE_NUM + +_num_words = [ + "nul", + "jedyn", "jedna", "jedne", + "dwaj", "dwě", + "tři", "třo", + "štyri", "štyrjo", + "pjeć", + "šěsć", + "sydom", + "wosom", + "dźewjeć", + "dźesać", + "jědnaće", + "dwanaće", + "třinaće", + "štyrnaće", + "pjatnaće", + "šěsnaće", + "sydomnaće", + "wosomnaće", + "dźewjatnaće", + "dwaceći" + "třiceći", + "štyrceći", + "pjećdźesat", + "šěsćdźesat", + "sydomdźesat", + "wosomdźesat", + "dźewjećdźesat", + "sto", + "tysac", + "milion", + "miliarda", + "bilion", + "biliarda", + "trilion", + "triliarda", +] + +_ordinal_words = [ + "prěni", "prěnja", "prěnje", + "druhi", "druha", "druhe", + "třeći", "třeća", "třeće", + "štwórty", "štwórta", "štwórte", + "pjaty", "pjata", "pjate", + "šěsty", "šěsta", "šěste", + "sydmy", "sydma", "sydme", + "wosmy", "wosma", "wosme", + "dźewjaty", "dźewjata", "dźewjate", + "dźesaty", "dźesata", "dźesate", + "jědnaty", "jědnata", "jědnate", + "dwanaty", "dwanata", "dwanate" +] + + +def like_num(text): + if text.startswith(("+", "-", "±", "~")): + text = text[1:] + text = text.replace(",", "").replace(".", "") + if text.isdigit(): + return True + if text.count("/") == 1: + num, denom = text.split("/") + if num.isdigit() and denom.isdigit(): + return True + text_lower = text.lower() + if text_lower in _num_words: + return True + # Check ordinal number + if text_lower in _ordinal_words: + return True + return False + + +LEX_ATTRS = {LIKE_NUM: like_num} diff --git a/spacy/lang/hsb/stop_words.py b/spacy/lang/hsb/stop_words.py new file mode 100644 index 000000000..e6fedaf4c --- /dev/null +++ b/spacy/lang/hsb/stop_words.py @@ -0,0 +1,19 @@ +STOP_WORDS = set( + """ +a abo ale ani + +dokelž + +hdyž + +jeli jelizo + +kaž + +pak potom + +tež tohodla + +zo zoby +""".split() +) diff --git a/spacy/lang/hsb/tokenizer_exceptions.py b/spacy/lang/hsb/tokenizer_exceptions.py new file mode 100644 index 000000000..4b9a4f98a --- /dev/null +++ b/spacy/lang/hsb/tokenizer_exceptions.py @@ -0,0 +1,18 @@ +from ..tokenizer_exceptions import BASE_EXCEPTIONS +from ...symbols import ORTH, NORM +from ...util import update_exc + +_exc = dict() +for exc_data in [ + {ORTH: "mil.", NORM: "milion"}, + {ORTH: "wob.", NORM: "wobydler"}, +]: + _exc[exc_data[ORTH]] = [exc_data] + +for orth in [ + "resp.", +]: + _exc[orth] = [{ORTH: orth}] + + +TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc) diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index f9266cb94..7083fd817 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -221,6 +221,11 @@ def ja_tokenizer(): return get_lang_class("ja")().tokenizer +@pytest.fixture(scope="session") +def hsb_tokenizer(): + return get_lang_class("hsb")().tokenizer + + @pytest.fixture(scope="session") def ko_tokenizer(): pytest.importorskip("natto") diff --git a/spacy/tests/lang/hsb/__init__.py b/spacy/tests/lang/hsb/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/lang/hsb/test_text.py b/spacy/tests/lang/hsb/test_text.py new file mode 100644 index 000000000..aaa4984eb --- /dev/null +++ b/spacy/tests/lang/hsb/test_text.py @@ -0,0 +1,25 @@ +import pytest + + +@pytest.mark.parametrize( + "text,match", + [ + ("10", True), + ("1", True), + ("10,000", True), + ("10,00", True), + ("jedne", True), + ("dwanaće", True), + ("milion", True), + ("sto", True), + ("załožene", 
False), + ("wona", False), + ("powšitkownej", False), + (",", False), + ("1/2", True), + ], +) +def test_lex_attrs_like_number(hsb_tokenizer, text, match): + tokens = hsb_tokenizer(text) + assert len(tokens) == 1 + assert tokens[0].like_num == match diff --git a/spacy/tests/lang/hsb/test_tokenizer.py b/spacy/tests/lang/hsb/test_tokenizer.py new file mode 100644 index 000000000..a3ec89ba0 --- /dev/null +++ b/spacy/tests/lang/hsb/test_tokenizer.py @@ -0,0 +1,32 @@ +import pytest + +HSB_BASIC_TOKENIZATION_TESTS = [ + ( + "Hornjoserbšćina wobsteji resp. wobsteješe z wjacorych dialektow, kotrež so zdźěla chětro wot so rozeznawachu.", + [ + "Hornjoserbšćina", + "wobsteji", + "resp.", + "wobsteješe", + "z", + "wjacorych", + "dialektow", + ",", + "kotrež", + "so", + "zdźěla", + "chětro", + "wot", + "so", + "rozeznawachu", + ".", + ], + ), +] + + +@pytest.mark.parametrize("text,expected_tokens", HSB_BASIC_TOKENIZATION_TESTS) +def test_hsb_tokenizer_basic(hsb_tokenizer, text, expected_tokens): + tokens = hsb_tokenizer(text) + token_list = [token.text for token in tokens if not token.is_space] + assert expected_tokens == token_list From 61ba5450ff5de3c1bbbca21169772d1239ee822f Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Tue, 8 Mar 2022 00:56:57 +0900 Subject: [PATCH 050/123] Fix get_matching_ents (#10451) * Fix get_matching_ents Not sure what happened here - the code prior to this commit simply does not work. It's already covered by entity linker tests, which were succeeding in the NEL PR, but couldn't possibly succeed on master. * Fix test Test was indented inside another test and so doesn't seem to have been running properly. --- spacy/tests/pipeline/test_entity_linker.py | 108 ++++++++++----------- spacy/training/example.pyx | 4 +- 2 files changed, 56 insertions(+), 56 deletions(-) diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index 7d1382741..af2132d73 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -1009,65 +1009,65 @@ def test_legacy_architectures(name, config): losses = {} nlp.update(train_examples, sgd=optimizer, losses=losses) - @pytest.mark.parametrize("patterns", [ - # perfect case - [{"label": "CHARACTER", "pattern": "Kirby"}], - # typo for false negative - [{"label": "PERSON", "pattern": "Korby"}], - # random stuff for false positive - [{"label": "IS", "pattern": "is"}, {"label": "COLOR", "pattern": "pink"}], - ] - ) - def test_no_gold_ents(patterns): - # test that annotating components work - TRAIN_DATA = [ - ( - "Kirby is pink", - { - "links": {(0, 5): {"Q613241": 1.0}}, - "entities": [(0, 5, "CHARACTER")], - "sent_starts": [1, 0, 0], - }, - ) - ] - nlp = English() - vector_length = 3 - train_examples = [] - for text, annotation in TRAIN_DATA: - doc = nlp(text) - train_examples.append(Example.from_dict(doc, annotation)) +@pytest.mark.parametrize("patterns", [ + # perfect case + [{"label": "CHARACTER", "pattern": "Kirby"}], + # typo for false negative + [{"label": "PERSON", "pattern": "Korby"}], + # random stuff for false positive + [{"label": "IS", "pattern": "is"}, {"label": "COLOR", "pattern": "pink"}], + ] +) +def test_no_gold_ents(patterns): + # test that annotating components work + TRAIN_DATA = [ + ( + "Kirby is pink", + { + "links": {(0, 5): {"Q613241": 1.0}}, + "entities": [(0, 5, "CHARACTER")], + "sent_starts": [1, 0, 0], + }, + ) + ] + nlp = English() + vector_length = 3 + train_examples = [] + for text, annotation in TRAIN_DATA: + doc = nlp(text) + 
train_examples.append(Example.from_dict(doc, annotation)) - # Create a ruler to mark entities - ruler = nlp.add_pipe("entity_ruler") - ruler.add_patterns(patterns) + # Create a ruler to mark entities + ruler = nlp.add_pipe("entity_ruler") + ruler.add_patterns(patterns) - # Apply ruler to examples. In a real pipeline this would be an annotating component. - for eg in train_examples: - eg.predicted = ruler(eg.predicted) + # Apply ruler to examples. In a real pipeline this would be an annotating component. + for eg in train_examples: + eg.predicted = ruler(eg.predicted) - def create_kb(vocab): - # create artificial KB - mykb = KnowledgeBase(vocab, entity_vector_length=vector_length) - mykb.add_entity(entity="Q613241", freq=12, entity_vector=[6, -4, 3]) - mykb.add_alias("Kirby", ["Q613241"], [0.9]) - # Placeholder - mykb.add_entity(entity="pink", freq=12, entity_vector=[7, 2, -5]) - mykb.add_alias("pink", ["pink"], [0.9]) - return mykb + def create_kb(vocab): + # create artificial KB + mykb = KnowledgeBase(vocab, entity_vector_length=vector_length) + mykb.add_entity(entity="Q613241", freq=12, entity_vector=[6, -4, 3]) + mykb.add_alias("Kirby", ["Q613241"], [0.9]) + # Placeholder + mykb.add_entity(entity="pink", freq=12, entity_vector=[7, 2, -5]) + mykb.add_alias("pink", ["pink"], [0.9]) + return mykb - # Create and train the Entity Linker - entity_linker = nlp.add_pipe("entity_linker", config={"use_gold_ents": False}, last=True) - entity_linker.set_kb(create_kb) - assert entity_linker.use_gold_ents == False + # Create and train the Entity Linker + entity_linker = nlp.add_pipe("entity_linker", config={"use_gold_ents": False}, last=True) + entity_linker.set_kb(create_kb) + assert entity_linker.use_gold_ents == False - optimizer = nlp.initialize(get_examples=lambda: train_examples) - for i in range(2): - losses = {} - nlp.update(train_examples, sgd=optimizer, losses=losses) + optimizer = nlp.initialize(get_examples=lambda: train_examples) + for i in range(2): + losses = {} + nlp.update(train_examples, sgd=optimizer, losses=losses) - # adding additional components that are required for the entity_linker - nlp.add_pipe("sentencizer", first=True) + # adding additional components that are required for the entity_linker + nlp.add_pipe("sentencizer", first=True) - # this will run the pipeline on the examples and shouldn't crash - results = nlp.evaluate(train_examples) + # this will run the pipeline on the examples and shouldn't crash + results = nlp.evaluate(train_examples) diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx index a2c5e08e9..778dfd12a 100644 --- a/spacy/training/example.pyx +++ b/spacy/training/example.pyx @@ -263,11 +263,11 @@ cdef class Example: kept. Otherwise only the character indices need to match. """ gold = {} - for ent in self.reference: + for ent in self.reference.ents: gold[(ent.start_char, ent.end_char)] = ent.label keep = [] - for ent in self.predicted: + for ent in self.predicted.ents: key = (ent.start_char, ent.end_char) if key not in gold: continue From 5ca0dbae765c405f3aa74e32ab9e93d5ce752179 Mon Sep 17 00:00:00 2001 From: jnphilipp Date: Mon, 7 Mar 2022 16:57:14 +0100 Subject: [PATCH 051/123] Add Lower Sorbian support. (#10431) * Add support basic support for lower sorbian. * Add some test for dsb. 
* Update spacy/lang/dsb/examples.py Co-authored-by: Sofie Van Landeghem --- spacy/lang/dsb/__init__.py | 16 ++++++ spacy/lang/dsb/examples.py | 15 +++++ spacy/lang/dsb/lex_attrs.py | 77 ++++++++++++++++++++++++++ spacy/lang/dsb/stop_words.py | 15 +++++ spacy/tests/conftest.py | 5 ++ spacy/tests/lang/dsb/__init__.py | 0 spacy/tests/lang/dsb/test_text.py | 25 +++++++++ spacy/tests/lang/dsb/test_tokenizer.py | 29 ++++++++++ 8 files changed, 182 insertions(+) create mode 100644 spacy/lang/dsb/__init__.py create mode 100644 spacy/lang/dsb/examples.py create mode 100644 spacy/lang/dsb/lex_attrs.py create mode 100644 spacy/lang/dsb/stop_words.py create mode 100644 spacy/tests/lang/dsb/__init__.py create mode 100644 spacy/tests/lang/dsb/test_text.py create mode 100644 spacy/tests/lang/dsb/test_tokenizer.py diff --git a/spacy/lang/dsb/__init__.py b/spacy/lang/dsb/__init__.py new file mode 100644 index 000000000..c66092a0c --- /dev/null +++ b/spacy/lang/dsb/__init__.py @@ -0,0 +1,16 @@ +from .lex_attrs import LEX_ATTRS +from .stop_words import STOP_WORDS +from ...language import Language, BaseDefaults + + +class LowerSorbianDefaults(BaseDefaults): + lex_attr_getters = LEX_ATTRS + stop_words = STOP_WORDS + + +class LowerSorbian(Language): + lang = "dsb" + Defaults = LowerSorbianDefaults + + +__all__ = ["LowerSorbian"] diff --git a/spacy/lang/dsb/examples.py b/spacy/lang/dsb/examples.py new file mode 100644 index 000000000..28b8c41f1 --- /dev/null +++ b/spacy/lang/dsb/examples.py @@ -0,0 +1,15 @@ +""" +Example sentences to test spaCy and its language models. + +>>> from spacy.lang.dsb.examples import sentences +>>> docs = nlp.pipe(sentences) +""" + + +sentences = [ + "Z tym stwori so wuměnjenje a zakład za dalše wobdźěłanje přez analyzu tekstoweje struktury a semantisku anotaciju a z tym tež za tu předstajenu digitalnu online-wersiju.", + "Mi so tu jara derje spodoba.", + "Kotre nowniny chceće měć?", + "Tak ako w slědnem lěśe jo teke lětosa jano doma zapustowaś móžno.", + "Zwóstanjo pótakem hyšći wjele źěła." 
+] diff --git a/spacy/lang/dsb/lex_attrs.py b/spacy/lang/dsb/lex_attrs.py new file mode 100644 index 000000000..75fb2e590 --- /dev/null +++ b/spacy/lang/dsb/lex_attrs.py @@ -0,0 +1,77 @@ +from ...attrs import LIKE_NUM + +_num_words = [ + "nul", + "jaden", "jadna", "jadno", + "dwa", "dwě", + "tśi", "tśo", + "styri", "styrjo", + "pěś", "pěśo", + "šesć", "šesćo", + "sedym", "sedymjo", + "wósym", "wósymjo", + "źewjeś", "źewjeśo", + "źaseś", "źaseśo", + "jadnassćo", + "dwanassćo", + "tśinasćo", + "styrnasćo", + "pěśnasćo", + "šesnasćo", + "sedymnasćo", + "wósymnasćo", + "źewjeśnasćo", + "dwanasćo", "dwaźasća", + "tśiźasća", + "styrźasća", + "pěśźaset", + "šesćźaset", + "sedymźaset", + "wósymźaset", + "źewjeśźaset", + "sto", + "tysac", + "milion", + "miliarda", + "bilion", + "biliarda", + "trilion", + "triliarda", +] + +_ordinal_words = [ + "prědny", "prědna", "prědne", + "drugi", "druga", "druge", + "tśeśi", "tśeśa", "tśeśe", + "stwórty", "stwórta", "stwórte", + "pêty", "pěta", "pête", + "šesty", "šesta", "šeste", + "sedymy", "sedyma", "sedyme", + "wósymy", "wósyma", "wósyme", + "źewjety", "źewjeta", "źewjete", + "źasety", "źaseta", "źasete", + "jadnasty", "jadnasta", "jadnaste", + "dwanasty", "dwanasta", "dwanaste" +] + + +def like_num(text): + if text.startswith(("+", "-", "±", "~")): + text = text[1:] + text = text.replace(",", "").replace(".", "") + if text.isdigit(): + return True + if text.count("/") == 1: + num, denom = text.split("/") + if num.isdigit() and denom.isdigit(): + return True + text_lower = text.lower() + if text_lower in _num_words: + return True + # Check ordinal number + if text_lower in _ordinal_words: + return True + return False + + +LEX_ATTRS = {LIKE_NUM: like_num} diff --git a/spacy/lang/dsb/stop_words.py b/spacy/lang/dsb/stop_words.py new file mode 100644 index 000000000..376e04aa6 --- /dev/null +++ b/spacy/lang/dsb/stop_words.py @@ -0,0 +1,15 @@ +STOP_WORDS = set( + """ +a abo aby ako ale až + +daniž dokulaž + +gaž + +jolic + +pak pótom + +teke togodla +""".split() +) diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 7083fd817..24474c71e 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -99,6 +99,11 @@ def de_vocab(): return get_lang_class("de")().vocab +@pytest.fixture(scope="session") +def dsb_tokenizer(): + return get_lang_class("dsb")().tokenizer + + @pytest.fixture(scope="session") def el_tokenizer(): return get_lang_class("el")().tokenizer diff --git a/spacy/tests/lang/dsb/__init__.py b/spacy/tests/lang/dsb/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/lang/dsb/test_text.py b/spacy/tests/lang/dsb/test_text.py new file mode 100644 index 000000000..40f2c15e0 --- /dev/null +++ b/spacy/tests/lang/dsb/test_text.py @@ -0,0 +1,25 @@ +import pytest + + +@pytest.mark.parametrize( + "text,match", + [ + ("10", True), + ("1", True), + ("10,000", True), + ("10,00", True), + ("jadno", True), + ("dwanassćo", True), + ("milion", True), + ("sto", True), + ("ceła", False), + ("kopica", False), + ("narěcow", False), + (",", False), + ("1/2", True), + ], +) +def test_lex_attrs_like_number(dsb_tokenizer, text, match): + tokens = dsb_tokenizer(text) + assert len(tokens) == 1 + assert tokens[0].like_num == match diff --git a/spacy/tests/lang/dsb/test_tokenizer.py b/spacy/tests/lang/dsb/test_tokenizer.py new file mode 100644 index 000000000..135974fb8 --- /dev/null +++ b/spacy/tests/lang/dsb/test_tokenizer.py @@ -0,0 +1,29 @@ +import pytest + +DSB_BASIC_TOKENIZATION_TESTS = [ + ( + "Ale eksistěrujo mimo 
togo ceła kopica narěcow, ako na pśikład slěpjańska.", + [ + "Ale", + "eksistěrujo", + "mimo", + "togo", + "ceła", + "kopica", + "narěcow", + ",", + "ako", + "na", + "pśikład", + "slěpjańska", + ".", + ], + ), +] + + +@pytest.mark.parametrize("text,expected_tokens", DSB_BASIC_TOKENIZATION_TESTS) +def test_dsb_tokenizer_basic(dsb_tokenizer, text, expected_tokens): + tokens = dsb_tokenizer(text) + token_list = [token.text for token in tokens if not token.is_space] + assert expected_tokens == token_list From b2bbefd0b542fcad527b9badf97fd1c3c69a7bbf Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 7 Mar 2022 17:03:45 +0100 Subject: [PATCH 052/123] Add Finnish, Korean, and Swedish models and Korean support notes (#10355) * Add Finnish, Korean, and Swedish models to website * Add Korean language support notes --- website/docs/usage/models.md | 47 +++++++++++++++++++++++++++++++++--- website/meta/languages.json | 21 +++++++++++++--- 2 files changed, 61 insertions(+), 7 deletions(-) diff --git a/website/docs/usage/models.md b/website/docs/usage/models.md index 3b79c4d0d..f82da44d9 100644 --- a/website/docs/usage/models.md +++ b/website/docs/usage/models.md @@ -259,6 +259,45 @@ used for training the current [Japanese pipelines](/models/ja). +### Korean language support {#korean} + +> #### mecab-ko tokenizer +> +> ```python +> nlp = spacy.blank("ko") +> ``` + +The default MeCab-based Korean tokenizer requires: + +- [mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md) +- [mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic) +- [natto-py](https://github.com/buruzaemon/natto-py) + +For some Korean datasets and tasks, the +[rule-based tokenizer](/usage/linguistic-features#tokenization) is better-suited +than MeCab. To configure a Korean pipeline with the rule-based tokenizer: + +> #### Rule-based tokenizer +> +> ```python +> config = {"nlp": {"tokenizer": {"@tokenizers": "spacy.Tokenizer.v1"}}} +> nlp = spacy.blank("ko", config=config) +> ``` + +```ini +### config.cfg +[nlp] +lang = "ko" +tokenizer = {"@tokenizers" = "spacy.Tokenizer.v1"} +``` + + + +The [Korean trained pipelines](/models/ko) use the rule-based tokenizer, so no +additional dependencies are required. + + + ## Installing and using trained pipelines {#download} The easiest way to download a trained pipeline is via spaCy's @@ -417,10 +456,10 @@ doc = nlp("This is a sentence.") You can use the [`info`](/api/cli#info) command or -[`spacy.info()`](/api/top-level#spacy.info) method to print a pipeline -package's meta data before loading it. Each `Language` object with a loaded -pipeline also exposes the pipeline's meta data as the attribute `meta`. For -example, `nlp.meta['version']` will return the package version. +[`spacy.info()`](/api/top-level#spacy.info) method to print a pipeline package's +meta data before loading it. Each `Language` object with a loaded pipeline also +exposes the pipeline's meta data as the attribute `meta`. For example, +`nlp.meta['version']` will return the package version. 
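
A minimal sketch (not itself part of any patch) of the meta-data inspection described in the `models.md` passage above, assuming one of the pipeline packages added in this patch, e.g. `ko_core_news_sm`, has been installed separately; the package name and printed fields are illustrative.

```python
import spacy

# Inspect a package's meta data before loading the pipeline.
meta = spacy.info("ko_core_news_sm")

# After loading, the same meta data is exposed on the Language object.
nlp = spacy.load("ko_core_news_sm")
print(nlp.meta["lang"], nlp.meta["name"], nlp.meta["version"])
```
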
diff --git a/website/meta/languages.json b/website/meta/languages.json index a7dda6482..1c4379b6d 100644 --- a/website/meta/languages.json +++ b/website/meta/languages.json @@ -114,7 +114,12 @@ { "code": "fi", "name": "Finnish", - "has_examples": true + "has_examples": true, + "models": [ + "fi_core_news_sm", + "fi_core_news_md", + "fi_core_news_lg" + ] }, { "code": "fr", @@ -227,7 +232,12 @@ } ], "example": "이것은 문장입니다.", - "has_examples": true + "has_examples": true, + "models": [ + "ko_core_news_sm", + "ko_core_news_md", + "ko_core_news_lg" + ] }, { "code": "ky", @@ -388,7 +398,12 @@ { "code": "sv", "name": "Swedish", - "has_examples": true + "has_examples": true, + "models": [ + "sv_core_news_sm", + "sv_core_news_md", + "sv_core_news_lg" + ] }, { "code": "ta", From 60520d86693699c1221a4414a133f76ffb9601b0 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 8 Mar 2022 13:51:11 +0100 Subject: [PATCH 053/123] Fix types in API docs for moves in parser and ner (#10464) --- website/docs/api/dependencyparser.md | 2 +- website/docs/api/entityrecognizer.md | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/website/docs/api/dependencyparser.md b/website/docs/api/dependencyparser.md index 118cdc611..103e0826e 100644 --- a/website/docs/api/dependencyparser.md +++ b/website/docs/api/dependencyparser.md @@ -100,7 +100,7 @@ shortcut for this and instantiate the component using its string name and | `vocab` | The shared vocabulary. ~~Vocab~~ | | `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ | | `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | -| `moves` | A list of transition names. Inferred from the data if not provided. ~~Optional[List[str]]~~ | +| `moves` | A list of transition names. Inferred from the data if not provided. ~~Optional[TransitionSystem]~~ | | _keyword-only_ | | | `update_with_oracle_cut_size` | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. Defaults to `100`. ~~int~~ | | `learn_tokens` | Whether to learn to merge subtokens that are split relative to the gold standard. Experimental. Defaults to `False`. ~~bool~~ | diff --git a/website/docs/api/entityrecognizer.md b/website/docs/api/entityrecognizer.md index 14b6fece4..7c153f064 100644 --- a/website/docs/api/entityrecognizer.md +++ b/website/docs/api/entityrecognizer.md @@ -62,7 +62,7 @@ architectures and their arguments and hyperparameters. | Setting | Description | | ----------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `moves` | A list of transition names. Inferred from the data if not provided. Defaults to `None`. ~~Optional[List[str]]~~ | +| `moves` | A list of transition names. Inferred from the data if not provided. Defaults to `None`. ~~Optional[TransitionSystem]~~ | | `update_with_oracle_cut_size` | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. Defaults to `100`. 
~~int~~ | | `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [TransitionBasedParser](/api/architectures#TransitionBasedParser). ~~Model[List[Doc], List[Floats2d]]~~ | | `incorrect_spans_key` | This key refers to a `SpanGroup` in `doc.spans` that specifies incorrect spans. The NER will learn not to predict (exactly) those spans. Defaults to `None`. ~~Optional[str]~~ | @@ -98,7 +98,7 @@ shortcut for this and instantiate the component using its string name and | `vocab` | The shared vocabulary. ~~Vocab~~ | | `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ | | `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | -| `moves` | A list of transition names. Inferred from the data if set to `None`, which is the default. ~~Optional[List[str]]~~ | +| `moves` | A list of transition names. Inferred from the data if set to `None`, which is the default. ~~Optional[TransitionSystem]~~ | | _keyword-only_ | | | `update_with_oracle_cut_size` | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. Defaults to `100`. ~~int~~ | | `incorrect_spans_key` | Identifies spans that are known to be incorrect entity annotations. The incorrect entity annotations can be stored in the span group in [`Doc.spans`](/api/doc#spans), under this key. Defaults to `None`. ~~Optional[str]~~ | From 191e8b31fa75f60b32f9e4779fe629b3c31e7c5e Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 8 Mar 2022 14:28:46 +0100 Subject: [PATCH 054/123] Remove English tokenizer exception May. (#10463) --- spacy/lang/en/tokenizer_exceptions.py | 1 - 1 file changed, 1 deletion(-) diff --git a/spacy/lang/en/tokenizer_exceptions.py b/spacy/lang/en/tokenizer_exceptions.py index 55b544e42..2c20b8c27 100644 --- a/spacy/lang/en/tokenizer_exceptions.py +++ b/spacy/lang/en/tokenizer_exceptions.py @@ -447,7 +447,6 @@ for exc_data in [ {ORTH: "La.", NORM: "Louisiana"}, {ORTH: "Mar.", NORM: "March"}, {ORTH: "Mass.", NORM: "Massachusetts"}, - {ORTH: "May.", NORM: "May"}, {ORTH: "Mich.", NORM: "Michigan"}, {ORTH: "Minn.", NORM: "Minnesota"}, {ORTH: "Miss.", NORM: "Mississippi"}, From 01ec6349eab7fd1d426a29bd6b9546826fb38bfa Mon Sep 17 00:00:00 2001 From: Peter Baumgartner <5107405+pmbaumgartner@users.noreply.github.com> Date: Tue, 8 Mar 2022 10:04:10 -0500 Subject: [PATCH 055/123] Add `path.mkdir` to custom component examples of `to_disk` (#10348) * add `path.mkdir` to examples * add ensure_path + mkdir * update highlights --- website/docs/usage/processing-pipelines.md | 6 +++++- website/docs/usage/saving-loading.md | 12 +++++++++++- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md index 11fd1459d..9e6ee54df 100644 --- a/website/docs/usage/processing-pipelines.md +++ b/website/docs/usage/processing-pipelines.md @@ -1081,13 +1081,17 @@ on [serialization methods](/usage/saving-loading/#serialization-methods). > directory. ```python -### Custom serialization methods {highlight="6-7,9-11"} +### Custom serialization methods {highlight="7-11,13-15"} import srsly +from spacy.util import ensure_path class AcronymComponent: # other methods here... 
def to_disk(self, path, exclude=tuple()): + path = ensure_path(path) + if not path.exists(): + path.mkdir() srsly.write_json(path / "data.json", self.data) def from_disk(self, path, exclude=tuple()): diff --git a/website/docs/usage/saving-loading.md b/website/docs/usage/saving-loading.md index 9dad077e7..af140e7a7 100644 --- a/website/docs/usage/saving-loading.md +++ b/website/docs/usage/saving-loading.md @@ -202,7 +202,9 @@ the data to and from a JSON file. > rules _with_ the component data. ```python -### {highlight="14-18,20-25"} +### {highlight="16-23,25-30"} +from spacy.util import ensure_path + @Language.factory("my_component") class CustomComponent: def __init__(self): @@ -218,6 +220,9 @@ class CustomComponent: def to_disk(self, path, exclude=tuple()): # This will receive the directory path + /my_component + path = ensure_path(path) + if not path.exists(): + path.mkdir() data_path = path / "data.json" with data_path.open("w", encoding="utf8") as f: f.write(json.dumps(self.data)) @@ -467,7 +472,12 @@ pipeline package. When you save out a pipeline using `nlp.to_disk` and the component exposes a `to_disk` method, it will be called with the disk path. ```python +from spacy.util import ensure_path + def to_disk(self, path, exclude=tuple()): + path = ensure_path(path) + if not path.exists(): + path.mkdir() snek_path = path / "snek.txt" with snek_path.open("w", encoding="utf8") as snek_file: snek_file.write(self.snek) From 297dd82c86372c7aa0a181e55dc72512718aafe8 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 11 Mar 2022 10:50:47 +0100 Subject: [PATCH 056/123] Fix initial special cases for Tokenizer.explain (#10460) Add the missing initial check for special cases to `Tokenizer.explain` to align with `Tokenizer._tokenize_affixes`. --- spacy/tests/tokenizer/test_tokenizer.py | 13 +++++++++++ spacy/tokenizer.pyx | 4 ++++ website/docs/usage/linguistic-features.md | 28 ++++++++++++++--------- 3 files changed, 34 insertions(+), 11 deletions(-) diff --git a/spacy/tests/tokenizer/test_tokenizer.py b/spacy/tests/tokenizer/test_tokenizer.py index a7270cb1e..ed11508b4 100644 --- a/spacy/tests/tokenizer/test_tokenizer.py +++ b/spacy/tests/tokenizer/test_tokenizer.py @@ -521,3 +521,16 @@ def test_tokenizer_infix_prefix(en_vocab): assert tokens == ["±10", "%"] explain_tokens = [t[1] for t in tokenizer.explain("±10%")] assert tokens == explain_tokens + + +def test_tokenizer_initial_special_case_explain(en_vocab): + tokenizer = Tokenizer( + en_vocab, + token_match=re.compile("^id$").match, + rules={ + "id": [{"ORTH": "i"}, {"ORTH": "d"}], + } + ) + tokens = [t.text for t in tokenizer("id")] + explain_tokens = [t[1] for t in tokenizer.explain("id")] + assert tokens == explain_tokens diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 91f228032..ac55a61f3 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -643,6 +643,10 @@ cdef class Tokenizer: for substring in text.split(): suffixes = [] while substring: + if substring in special_cases: + tokens.extend(("SPECIAL-" + str(i + 1), self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring])) + substring = '' + continue while prefix_search(substring) or suffix_search(substring): if token_match(substring): tokens.append(("TOKEN_MATCH", substring)) diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md index f8baf5588..c3f25565a 100644 --- a/website/docs/usage/linguistic-features.md +++ b/website/docs/usage/linguistic-features.md @@ -799,6 +799,10 @@ def 
tokenizer_pseudo_code( for substring in text.split(): suffixes = [] while substring: + if substring in special_cases: + tokens.extend(special_cases[substring]) + substring = "" + continue while prefix_search(substring) or suffix_search(substring): if token_match(substring): tokens.append(substring) @@ -851,20 +855,22 @@ def tokenizer_pseudo_code( The algorithm can be summarized as follows: 1. Iterate over space-separated substrings. -2. Look for a token match. If there is a match, stop processing and keep this - token. -3. Check whether we have an explicitly defined special case for this substring. +2. Check whether we have an explicitly defined special case for this substring. If we do, use it. -4. Otherwise, try to consume one prefix. If we consumed a prefix, go back to #2, +3. Look for a token match. If there is a match, stop processing and keep this + token. +4. Check whether we have an explicitly defined special case for this substring. + If we do, use it. +5. Otherwise, try to consume one prefix. If we consumed a prefix, go back to #3, so that the token match and special cases always get priority. -5. If we didn't consume a prefix, try to consume a suffix and then go back to - #2. -6. If we can't consume a prefix or a suffix, look for a URL match. -7. If there's no URL match, then look for a special case. -8. Look for "infixes" – stuff like hyphens etc. and split the substring into +6. If we didn't consume a prefix, try to consume a suffix and then go back to + #3. +7. If we can't consume a prefix or a suffix, look for a URL match. +8. If there's no URL match, then look for a special case. +9. Look for "infixes" – stuff like hyphens etc. and split the substring into tokens on all infixes. -9. Once we can't consume any more of the string, handle it as a single token. -10. Make a final pass over the text to check for special cases that include +10. Once we can't consume any more of the string, handle it as a single token. +11. Make a final pass over the text to check for special cases that include spaces or that were missed due to the incremental processing of affixes. From 1bbf23207487da4463e8de96efdb2145b408823e Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 11 Mar 2022 12:20:23 +0100 Subject: [PATCH 057/123] Auto-format code with black (#10479) * Auto-format code with black * Update spacy/lang/hsb/lex_attrs.py Co-authored-by: explosion-bot Co-authored-by: Adriane Boyd --- spacy/lang/dsb/examples.py | 2 +- spacy/lang/dsb/lex_attrs.py | 82 ++++++++++++++++------ spacy/lang/hsb/examples.py | 2 +- spacy/lang/hsb/lex_attrs.py | 63 ++++++++++++----- spacy/tests/pipeline/test_entity_linker.py | 24 ++++--- 5 files changed, 121 insertions(+), 52 deletions(-) diff --git a/spacy/lang/dsb/examples.py b/spacy/lang/dsb/examples.py index 28b8c41f1..6e9143826 100644 --- a/spacy/lang/dsb/examples.py +++ b/spacy/lang/dsb/examples.py @@ -11,5 +11,5 @@ sentences = [ "Mi so tu jara derje spodoba.", "Kotre nowniny chceće měć?", "Tak ako w slědnem lěśe jo teke lětosa jano doma zapustowaś móžno.", - "Zwóstanjo pótakem hyšći wjele źěła." 
+ "Zwóstanjo pótakem hyšći wjele źěła.", ] diff --git a/spacy/lang/dsb/lex_attrs.py b/spacy/lang/dsb/lex_attrs.py index 75fb2e590..367b3afb8 100644 --- a/spacy/lang/dsb/lex_attrs.py +++ b/spacy/lang/dsb/lex_attrs.py @@ -2,16 +2,27 @@ from ...attrs import LIKE_NUM _num_words = [ "nul", - "jaden", "jadna", "jadno", - "dwa", "dwě", - "tśi", "tśo", - "styri", "styrjo", - "pěś", "pěśo", - "šesć", "šesćo", - "sedym", "sedymjo", - "wósym", "wósymjo", - "źewjeś", "źewjeśo", - "źaseś", "źaseśo", + "jaden", + "jadna", + "jadno", + "dwa", + "dwě", + "tśi", + "tśo", + "styri", + "styrjo", + "pěś", + "pěśo", + "šesć", + "šesćo", + "sedym", + "sedymjo", + "wósym", + "wósymjo", + "źewjeś", + "źewjeśo", + "źaseś", + "źaseśo", "jadnassćo", "dwanassćo", "tśinasćo", @@ -21,7 +32,8 @@ _num_words = [ "sedymnasćo", "wósymnasćo", "źewjeśnasćo", - "dwanasćo", "dwaźasća", + "dwanasćo", + "dwaźasća", "tśiźasća", "styrźasća", "pěśźaset", @@ -40,18 +52,42 @@ _num_words = [ ] _ordinal_words = [ - "prědny", "prědna", "prědne", - "drugi", "druga", "druge", - "tśeśi", "tśeśa", "tśeśe", - "stwórty", "stwórta", "stwórte", - "pêty", "pěta", "pête", - "šesty", "šesta", "šeste", - "sedymy", "sedyma", "sedyme", - "wósymy", "wósyma", "wósyme", - "źewjety", "źewjeta", "źewjete", - "źasety", "źaseta", "źasete", - "jadnasty", "jadnasta", "jadnaste", - "dwanasty", "dwanasta", "dwanaste" + "prědny", + "prědna", + "prědne", + "drugi", + "druga", + "druge", + "tśeśi", + "tśeśa", + "tśeśe", + "stwórty", + "stwórta", + "stwórte", + "pêty", + "pěta", + "pête", + "šesty", + "šesta", + "šeste", + "sedymy", + "sedyma", + "sedyme", + "wósymy", + "wósyma", + "wósyme", + "źewjety", + "źewjeta", + "źewjete", + "źasety", + "źaseta", + "źasete", + "jadnasty", + "jadnasta", + "jadnaste", + "dwanasty", + "dwanasta", + "dwanaste", ] diff --git a/spacy/lang/hsb/examples.py b/spacy/lang/hsb/examples.py index 0aafd5cee..21f6f7584 100644 --- a/spacy/lang/hsb/examples.py +++ b/spacy/lang/hsb/examples.py @@ -11,5 +11,5 @@ sentences = [ "Jogo pśewóźowarce stej groniłej, až how w serbskich stronach njama Santa Claus nic pytaś.", "A ten sobuźěłaśeŕ Statneje biblioteki w Barlinju jo pśimjeł drogotne knigły bźez rukajcowu z nagima rukoma!", "Take wobchadanje z našym kulturnym derbstwom zewšym njejźo.", - "Wopśimjeśe drugich pśinoskow jo było na wusokem niwowje, ako pśecej." 
+ "Wopśimjeśe drugich pśinoskow jo było na wusokem niwowje, ako pśecej.", ] diff --git a/spacy/lang/hsb/lex_attrs.py b/spacy/lang/hsb/lex_attrs.py index dfda3e2db..5f300a73d 100644 --- a/spacy/lang/hsb/lex_attrs.py +++ b/spacy/lang/hsb/lex_attrs.py @@ -2,10 +2,15 @@ from ...attrs import LIKE_NUM _num_words = [ "nul", - "jedyn", "jedna", "jedne", - "dwaj", "dwě", - "tři", "třo", - "štyri", "štyrjo", + "jedyn", + "jedna", + "jedne", + "dwaj", + "dwě", + "tři", + "třo", + "štyri", + "štyrjo", "pjeć", "šěsć", "sydom", @@ -21,7 +26,7 @@ _num_words = [ "sydomnaće", "wosomnaće", "dźewjatnaće", - "dwaceći" + "dwaceći", "třiceći", "štyrceći", "pjećdźesat", @@ -40,18 +45,42 @@ _num_words = [ ] _ordinal_words = [ - "prěni", "prěnja", "prěnje", - "druhi", "druha", "druhe", - "třeći", "třeća", "třeće", - "štwórty", "štwórta", "štwórte", - "pjaty", "pjata", "pjate", - "šěsty", "šěsta", "šěste", - "sydmy", "sydma", "sydme", - "wosmy", "wosma", "wosme", - "dźewjaty", "dźewjata", "dźewjate", - "dźesaty", "dźesata", "dźesate", - "jědnaty", "jědnata", "jědnate", - "dwanaty", "dwanata", "dwanate" + "prěni", + "prěnja", + "prěnje", + "druhi", + "druha", + "druhe", + "třeći", + "třeća", + "třeće", + "štwórty", + "štwórta", + "štwórte", + "pjaty", + "pjata", + "pjate", + "šěsty", + "šěsta", + "šěste", + "sydmy", + "sydma", + "sydme", + "wosmy", + "wosma", + "wosme", + "dźewjaty", + "dźewjata", + "dźewjate", + "dźesaty", + "dźesata", + "dźesate", + "jědnaty", + "jědnata", + "jědnate", + "dwanaty", + "dwanata", + "dwanate", ] diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index af2132d73..83d5bf0e2 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -1009,14 +1009,17 @@ def test_legacy_architectures(name, config): losses = {} nlp.update(train_examples, sgd=optimizer, losses=losses) -@pytest.mark.parametrize("patterns", [ - # perfect case - [{"label": "CHARACTER", "pattern": "Kirby"}], - # typo for false negative - [{"label": "PERSON", "pattern": "Korby"}], - # random stuff for false positive - [{"label": "IS", "pattern": "is"}, {"label": "COLOR", "pattern": "pink"}], - ] + +@pytest.mark.parametrize( + "patterns", + [ + # perfect case + [{"label": "CHARACTER", "pattern": "Kirby"}], + # typo for false negative + [{"label": "PERSON", "pattern": "Korby"}], + # random stuff for false positive + [{"label": "IS", "pattern": "is"}, {"label": "COLOR", "pattern": "pink"}], + ], ) def test_no_gold_ents(patterns): # test that annotating components work @@ -1055,9 +1058,10 @@ def test_no_gold_ents(patterns): mykb.add_alias("pink", ["pink"], [0.9]) return mykb - # Create and train the Entity Linker - entity_linker = nlp.add_pipe("entity_linker", config={"use_gold_ents": False}, last=True) + entity_linker = nlp.add_pipe( + "entity_linker", config={"use_gold_ents": False}, last=True + ) entity_linker.set_kb(create_kb) assert entity_linker.use_gold_ents == False From 6af6c2e86cc7b08573b261563786bd1ab87d45e9 Mon Sep 17 00:00:00 2001 From: Lj Miranda <12949683+ljvmiranda921@users.noreply.github.com> Date: Mon, 14 Mar 2022 16:41:31 +0800 Subject: [PATCH 058/123] Add a note to the dev docs on mypy (#10485) --- extra/DEVELOPER_DOCS/Code Conventions.md | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/extra/DEVELOPER_DOCS/Code Conventions.md b/extra/DEVELOPER_DOCS/Code Conventions.md index eba466c46..37cd8ff27 100644 --- a/extra/DEVELOPER_DOCS/Code Conventions.md +++ b/extra/DEVELOPER_DOCS/Code 
Conventions.md @@ -137,7 +137,7 @@ If any of the TODOs you've added are important and should be fixed soon, you sho ## Type hints -We use Python type hints across the `.py` files wherever possible. This makes it easy to understand what a function expects and returns, and modern editors will be able to show this information to you when you call an annotated function. Type hints are not currently used in the `.pyx` (Cython) code, except for definitions of registered functions and component factories, where they're used for config validation. +We use Python type hints across the `.py` files wherever possible. This makes it easy to understand what a function expects and returns, and modern editors will be able to show this information to you when you call an annotated function. Type hints are not currently used in the `.pyx` (Cython) code, except for definitions of registered functions and component factories, where they're used for config validation. Ideally when developing, run `mypy spacy` on the code base to inspect any issues. If possible, you should always use the more descriptive type hints like `List[str]` or even `List[Any]` instead of only `list`. We also annotate arguments and return types of `Callable` – although, you can simplify this if the type otherwise gets too verbose (e.g. functions that return factories to create callbacks). Remember that `Callable` takes two values: a **list** of the argument type(s) in order, and the return values. @@ -155,6 +155,13 @@ def create_callback(some_arg: bool) -> Callable[[str, int], List[str]]: return callback ``` +For typing variables, we prefer the explicit format. + +```diff +- var = value # type: Type ++ var: Type = value +``` + For model architectures, Thinc also provides a collection of [custom types](https://thinc.ai/docs/api-types), including more specific types for arrays and model inputs/outputs. Even outside of static type checking, using these types will make the code a lot easier to read and follow, since it's always clear what array types are expected (and what might go wrong if the output is different from the expected type). 
```python From 23bc93d3d286ca050ae18a9e120331d94454229d Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Mon, 14 Mar 2022 15:17:22 +0100 Subject: [PATCH 059/123] limit pytest to <7.1 (#10488) * limit pytest to <7.1 * 7.1.0 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index b8970f686..a034dec27 100644 --- a/requirements.txt +++ b/requirements.txt @@ -26,7 +26,7 @@ typing_extensions>=3.7.4.1,<4.0.0.0; python_version < "3.8" # Development dependencies pre-commit>=2.13.0 cython>=0.25,<3.0 -pytest>=5.2.0 +pytest>=5.2.0,<7.1.0 pytest-timeout>=1.3.0,<2.0.0 mock>=2.0.0,<3.0.0 flake8>=3.8.0,<3.10.0 From b68bf43f5bf07b78c062777f35240f031374fe00 Mon Sep 17 00:00:00 2001 From: Edward <43848523+thomashacker@users.noreply.github.com> Date: Mon, 14 Mar 2022 15:47:57 +0100 Subject: [PATCH 060/123] Add spans to doc.to_json (#10073) * Add spans to to_json * adjustments to_json * Change docstring * change doc key naming * Update spacy/tokens/doc.pyx Co-authored-by: Adriane Boyd Co-authored-by: Adriane Boyd --- spacy/tests/doc/test_to_json.py | 12 +++++++++++- spacy/tokens/doc.pyx | 11 ++++++++++- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/spacy/tests/doc/test_to_json.py b/spacy/tests/doc/test_to_json.py index 9ebee6c88..202281654 100644 --- a/spacy/tests/doc/test_to_json.py +++ b/spacy/tests/doc/test_to_json.py @@ -1,5 +1,5 @@ import pytest -from spacy.tokens import Doc +from spacy.tokens import Doc, Span @pytest.fixture() @@ -60,3 +60,13 @@ def test_doc_to_json_underscore_error_serialize(doc): Doc.set_extension("json_test4", method=lambda doc: doc.text) with pytest.raises(ValueError): doc.to_json(underscore=["json_test4"]) + + +def test_doc_to_json_span(doc): + """Test that Doc.to_json() includes spans""" + doc.spans["test"] = [Span(doc, 0, 2, "test"), Span(doc, 0, 1, "test")] + json_doc = doc.to_json() + assert "spans" in json_doc + assert len(json_doc["spans"]) == 1 + assert len(json_doc["spans"]["test"]) == 2 + assert json_doc["spans"]["test"][0]["start"] == 0 diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index d33764ac9..1a48705fd 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -1457,7 +1457,7 @@ cdef class Doc: underscore (list): Optional list of string names of custom doc._. attributes. Attribute values need to be JSON-serializable. Values will be added to an "_" key in the data, e.g. "_": {"foo": "bar"}. - RETURNS (dict): The data in spaCy's JSON format. + RETURNS (dict): The data in JSON format. 
""" data = {"text": self.text} if self.has_annotation("ENT_IOB"): @@ -1486,6 +1486,15 @@ cdef class Doc: token_data["dep"] = token.dep_ token_data["head"] = token.head.i data["tokens"].append(token_data) + + if self.spans: + data["spans"] = {} + for span_group in self.spans: + data["spans"][span_group] = [] + for span in self.spans[span_group]: + span_data = {"start": span.start_char, "end": span.end_char, "label": span.label_, "kb_id": span.kb_id_} + data["spans"][span_group].append(span_data) + if underscore: data["_"] = {} for attr in underscore: From 2eef47dd26a5acbc3f667a2bc3b1ddf16f2d1b07 Mon Sep 17 00:00:00 2001 From: Edward <43848523+thomashacker@users.noreply.github.com> Date: Mon, 14 Mar 2022 16:46:58 +0100 Subject: [PATCH 061/123] Save span candidates produced by spancat suggesters (#10413) * Add save_candidates attribute * Change spancat api * Add unit test * reimplement method to produce a list of doc * Add method to docs * Add new version tag * Add intended use to docstring * prettier formatting --- spacy/pipeline/spancat.py | 18 ++++++++++++++++++ spacy/tests/pipeline/test_spancat.py | 22 ++++++++++++++++++++++ website/docs/api/spancategorizer.md | 18 ++++++++++++++++++ 3 files changed, 58 insertions(+) diff --git a/spacy/pipeline/spancat.py b/spacy/pipeline/spancat.py index 3759466d1..0a6138fbc 100644 --- a/spacy/pipeline/spancat.py +++ b/spacy/pipeline/spancat.py @@ -272,6 +272,24 @@ class SpanCategorizer(TrainablePipe): scores = self.model.predict((docs, indices)) # type: ignore return indices, scores + def set_candidates( + self, docs: Iterable[Doc], *, candidates_key: str = "candidates" + ) -> None: + """Use the spancat suggester to add a list of span candidates to a list of docs. + This method is intended to be used for debugging purposes. + + docs (Iterable[Doc]): The documents to modify. + candidates_key (str): Key of the Doc.spans dict to save the candidate spans under. + + DOCS: https://spacy.io/api/spancategorizer#set_candidates + """ + suggester_output = self.suggester(docs, ops=self.model.ops) + + for candidates, doc in zip(suggester_output, docs): # type: ignore + doc.spans[candidates_key] = [] + for index in candidates.dataXd: + doc.spans[candidates_key].append(doc[index[0] : index[1]]) + def set_annotations(self, docs: Iterable[Doc], indices_scores) -> None: """Modify a batch of Doc objects, using pre-computed scores. 
diff --git a/spacy/tests/pipeline/test_spancat.py b/spacy/tests/pipeline/test_spancat.py index 8060bc621..15256a763 100644 --- a/spacy/tests/pipeline/test_spancat.py +++ b/spacy/tests/pipeline/test_spancat.py @@ -397,3 +397,25 @@ def test_zero_suggestions(): assert set(spancat.labels) == {"LOC", "PERSON"} nlp.update(train_examples, sgd=optimizer) + + +def test_set_candidates(): + nlp = Language() + spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY}) + train_examples = make_examples(nlp) + nlp.initialize(get_examples=lambda: train_examples) + texts = [ + "Just a sentence.", + "I like London and Berlin", + "I like Berlin", + "I eat ham.", + ] + + docs = [nlp(text) for text in texts] + spancat.set_candidates(docs) + + assert len(docs) == len(texts) + assert type(docs[0].spans["candidates"]) == SpanGroup + assert len(docs[0].spans["candidates"]) == 9 + assert docs[0].spans["candidates"][0].text == "Just" + assert docs[0].spans["candidates"][4].text == "Just a" diff --git a/website/docs/api/spancategorizer.md b/website/docs/api/spancategorizer.md index 26fcaefdf..fc666aaf7 100644 --- a/website/docs/api/spancategorizer.md +++ b/website/docs/api/spancategorizer.md @@ -239,6 +239,24 @@ Delegates to [`predict`](/api/spancategorizer#predict) and | `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ | | **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ | +## SpanCategorizer.set_candidates {#set_candidates tag="method", new="3.3"} + +Use the suggester to add a list of [`Span`](/api/span) candidates to a list of +[`Doc`](/api/doc) objects. This method is intended to be used for debugging +purposes. + +> #### Example +> +> ```python +> spancat = nlp.add_pipe("spancat") +> spancat.set_candidates(docs, "candidates") +> ``` + +| Name | Description | +| ---------------- | -------------------------------------------------------------------- | +| `docs` | The documents to modify. ~~Iterable[Doc]~~ | +| `candidates_key` | Key of the Doc.spans dict to save the candidate spans under. ~~str~~ | + ## SpanCategorizer.get_loss {#get_loss tag="method"} Find the loss and gradient of loss for the batch of documents and their From 0dc454ba9577262ba23279e66f5ea384dd6677fb Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 15 Mar 2022 09:10:47 +0100 Subject: [PATCH 062/123] Update docs for Vocab.get_vector (#10486) * Update docs for Vocab.get_vector * Clarify description of 0-vector dimensions --- spacy/vocab.pyx | 5 +++-- website/docs/api/vocab.md | 9 +++------ 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index badd291ed..58036fffa 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -354,8 +354,9 @@ cdef class Vocab: def get_vector(self, orth): """Retrieve a vector for a word in the vocabulary. Words can be looked - up by string or int ID. If no vectors data is loaded, ValueError is - raised. + up by string or int ID. If the current vectors do not contain an entry + for the word, a 0-vector with the same number of dimensions as the + current vectors is returned. orth (int / unicode): The hash value of a word, or its unicode string. RETURNS (numpy.ndarray or cupy.ndarray): A word vector. Size diff --git a/website/docs/api/vocab.md b/website/docs/api/vocab.md index c0a269d95..4698c68c3 100644 --- a/website/docs/api/vocab.md +++ b/website/docs/api/vocab.md @@ -168,22 +168,19 @@ cosines are calculated in minibatches to reduce memory usage. 
## Vocab.get_vector {#get_vector tag="method" new="2"} Retrieve a vector for a word in the vocabulary. Words can be looked up by string -or hash value. If no vectors data is loaded, a `ValueError` is raised. If `minn` -is defined, then the resulting vector uses [FastText](https://fasttext.cc/)'s -subword features by average over n-grams of `orth` (introduced in spaCy `v2.1`). +or hash value. If the current vectors do not contain an entry for the word, a +0-vector with the same number of dimensions +([`Vocab.vectors_length`](#attributes)) as the current vectors is returned. > #### Example > > ```python > nlp.vocab.get_vector("apple") -> nlp.vocab.get_vector("apple", minn=1, maxn=5) > ``` | Name | Description | | ----------------------------------- | ---------------------------------------------------------------------------------------------------------------------- | | `orth` | The hash value of a word, or its unicode string. ~~Union[int, str]~~ | -| `minn` 2.1 | Minimum n-gram length used for FastText's n-gram computation. Defaults to the length of `orth`. ~~int~~ | -| `maxn` 2.1 | Maximum n-gram length used for FastText's n-gram computation. Defaults to the length of `orth`. ~~int~~ | | **RETURNS** | A word vector. Size and shape are determined by the `Vocab.vectors` instance. ~~numpy.ndarray[ndim=1, dtype=float32]~~ | ## Vocab.set_vector {#set_vector tag="method" new="2"} From 610001e8c724ee57fec301469454d80e955385a8 Mon Sep 17 00:00:00 2001 From: vincent d warmerdam Date: Tue, 15 Mar 2022 11:12:04 +0100 Subject: [PATCH 063/123] Update universe.json (#10490) The project moved away from Rasa and into my personal GitHub account. --- website/meta/universe.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/website/meta/universe.json b/website/meta/universe.json index 0179830d0..e178eab1f 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -377,10 +377,10 @@ "title": "whatlies", "slogan": "Make interactive visualisations to figure out 'what lies' in word embeddings.", "description": "This small library offers tools to make visualisation easier of both word embeddings as well as operations on them. It has support for spaCy prebuilt models as a first class citizen but also offers support for sense2vec. 
There's a convenient API to perform linear algebra as well as support for popular transformations like PCA/UMAP/etc.", - "github": "rasahq/whatlies", + "github": "koaning/whatlies", "pip": "whatlies", "thumb": "https://i.imgur.com/rOkOiLv.png", - "image": "https://raw.githubusercontent.com/RasaHQ/whatlies/master/docs/gif-two.gif", + "image": "https://raw.githubusercontent.com/koaning/whatlies/master/docs/gif-two.gif", "code_example": [ "from whatlies import EmbeddingSet", "from whatlies.language import SpacyLanguage", From e8357923ec873e5a66129a0ee84e05d42e9234cb Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 15 Mar 2022 11:12:50 +0100 Subject: [PATCH 064/123] Various install docs updates (#10487) * Simplify quickstart source install to use only editable pip install * Update pytorch install instructions to more recent versions --- website/docs/usage/embeddings-transformers.md | 12 ++++++------ website/src/widgets/quickstart-install.js | 9 +-------- 2 files changed, 7 insertions(+), 14 deletions(-) diff --git a/website/docs/usage/embeddings-transformers.md b/website/docs/usage/embeddings-transformers.md index 708cdd8bf..70fa95099 100644 --- a/website/docs/usage/embeddings-transformers.md +++ b/website/docs/usage/embeddings-transformers.md @@ -211,23 +211,23 @@ PyTorch as a dependency below, but it may not find the best version for your setup. ```bash -### Example: Install PyTorch 1.7.1 for CUDA 10.1 with pip +### Example: Install PyTorch 1.11.0 for CUDA 11.3 with pip # See: https://pytorch.org/get-started/locally/ -$ pip install torch==1.7.1+cu101 torchvision==0.8.2+cu101 torchaudio==0.7.2 -f https://download.pytorch.org/whl/torch_stable.html +$ pip install torch==1.11.0+cu113 torchvision==0.12.0+cu113 torchaudio==0.11.0+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html ``` Next, install spaCy with the extras for your CUDA version and transformers. The -CUDA extra (e.g., `cuda92`, `cuda102`, `cuda111`) installs the correct version -of [`cupy`](https://docs.cupy.dev/en/stable/install.html#installing-cupy), which +CUDA extra (e.g., `cuda102`, `cuda113`) installs the correct version of +[`cupy`](https://docs.cupy.dev/en/stable/install.html#installing-cupy), which is just like `numpy`, but for GPU. You may also need to set the `CUDA_PATH` environment variable if your CUDA runtime is installed in a non-standard -location. Putting it all together, if you had installed CUDA 10.2 in +location. Putting it all together, if you had installed CUDA 11.3 in `/opt/nvidia/cuda`, you would run: ```bash ### Installation with CUDA $ export CUDA_PATH="/opt/nvidia/cuda" -$ pip install -U %%SPACY_PKG_NAME[cuda102,transformers]%%SPACY_PKG_FLAGS +$ pip install -U %%SPACY_PKG_NAME[cuda113,transformers]%%SPACY_PKG_FLAGS ``` For [`transformers`](https://huggingface.co/transformers/) v4.0.0+ and models diff --git a/website/src/widgets/quickstart-install.js b/website/src/widgets/quickstart-install.js index 1c8ad19da..fbf043c7d 100644 --- a/website/src/widgets/quickstart-install.js +++ b/website/src/widgets/quickstart-install.js @@ -214,16 +214,9 @@ const QuickstartInstall = ({ id, title }) => { {nightly ? ` --branch ${DEFAULT_BRANCH}` : ''} cd spaCy - - export PYTHONPATH=`pwd` - - - set PYTHONPATH=C:\path\to\spaCy - pip install -r requirements.txt - python setup.py build_ext --inplace - pip install {train || hardware == 'gpu' ? `'.[${pipExtras}]'` : '.'} + pip install --no-build-isolation --editable {train || hardware == 'gpu' ? 
`'.[${pipExtras}]'` : '.'} # packages only available via pip From e5debc68e4910384351938f574ede7c9b35a2a5d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Tue, 15 Mar 2022 14:15:31 +0100 Subject: [PATCH 065/123] Tagger: use unnormalized probabilities for inference (#10197) * Tagger: use unnormalized probabilities for inference Using unnormalized softmax avoids use of the relatively expensive exp function, which can significantly speed up non-transformer models (e.g. I got a speedup of 27% on a German tagging + parsing pipeline). * Add spacy.Tagger.v2 with configurable normalization Normalization of probabilities is disabled by default to improve performance. * Update documentation, models, and tests to spacy.Tagger.v2 * Move Tagger.v1 to spacy-legacy * docs/architectures: run prettier * Unnormalized softmax is now a Softmax_v2 option * Require thinc 8.0.14 and spacy-legacy 3.0.9 --- pyproject.toml | 2 +- requirements.txt | 2 +- setup.cfg | 4 ++-- spacy/cli/templates/quickstart_training.jinja | 8 +++---- spacy/ml/models/tagger.py | 10 +++++---- spacy/pipeline/morphologizer.pyx | 2 +- spacy/pipeline/senter.pyx | 2 +- spacy/pipeline/tagger.pyx | 2 +- spacy/tests/pipeline/test_tok2vec.py | 6 +++--- .../tests/serialize/test_serialize_config.py | 4 ++-- .../serialize/test_serialize_language.py | 2 +- spacy/tests/training/test_pretraining.py | 6 +++--- spacy/tests/training/test_training.py | 2 +- website/docs/api/architectures.md | 21 ++++++++++++++----- 14 files changed, 43 insertions(+), 30 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index f81484d43..a43b4c814 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ requires = [ "cymem>=2.0.2,<2.1.0", "preshed>=3.0.2,<3.1.0", "murmurhash>=0.28.0,<1.1.0", - "thinc>=8.0.12,<8.1.0", + "thinc>=8.0.14,<8.1.0", "blis>=0.4.0,<0.8.0", "pathy", "numpy>=1.15.0", diff --git a/requirements.txt b/requirements.txt index a034dec27..4da6d5df6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,7 +3,7 @@ spacy-legacy>=3.0.9,<3.1.0 spacy-loggers>=1.0.0,<2.0.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 -thinc>=8.0.12,<8.1.0 +thinc>=8.0.14,<8.1.0 blis>=0.4.0,<0.8.0 ml_datasets>=0.2.0,<0.3.0 murmurhash>=0.28.0,<1.1.0 diff --git a/setup.cfg b/setup.cfg index ed3bf63ce..3c5ba884a 100644 --- a/setup.cfg +++ b/setup.cfg @@ -38,7 +38,7 @@ setup_requires = cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 murmurhash>=0.28.0,<1.1.0 - thinc>=8.0.12,<8.1.0 + thinc>=8.0.14,<8.1.0 install_requires = # Our libraries spacy-legacy>=3.0.9,<3.1.0 @@ -46,7 +46,7 @@ install_requires = murmurhash>=0.28.0,<1.1.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 - thinc>=8.0.12,<8.1.0 + thinc>=8.0.14,<8.1.0 blis>=0.4.0,<0.8.0 wasabi>=0.8.1,<1.1.0 srsly>=2.4.1,<3.0.0 diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja index da533b767..b84fb3a8f 100644 --- a/spacy/cli/templates/quickstart_training.jinja +++ b/spacy/cli/templates/quickstart_training.jinja @@ -54,7 +54,7 @@ stride = 96 factory = "morphologizer" [components.morphologizer.model] -@architectures = "spacy.Tagger.v1" +@architectures = "spacy.Tagger.v2" nO = null [components.morphologizer.model.tok2vec] @@ -70,7 +70,7 @@ grad_factor = 1.0 factory = "tagger" [components.tagger.model] -@architectures = "spacy.Tagger.v1" +@architectures = "spacy.Tagger.v2" nO = null [components.tagger.model.tok2vec] @@ -238,7 +238,7 @@ maxout_pieces = 3 factory = "morphologizer" [components.morphologizer.model] -@architectures = "spacy.Tagger.v1" 
+@architectures = "spacy.Tagger.v2" nO = null [components.morphologizer.model.tok2vec] @@ -251,7 +251,7 @@ width = ${components.tok2vec.model.encode.width} factory = "tagger" [components.tagger.model] -@architectures = "spacy.Tagger.v1" +@architectures = "spacy.Tagger.v2" nO = null [components.tagger.model.tok2vec] diff --git a/spacy/ml/models/tagger.py b/spacy/ml/models/tagger.py index 9c7fe042d..9f8ef7b2b 100644 --- a/spacy/ml/models/tagger.py +++ b/spacy/ml/models/tagger.py @@ -1,14 +1,14 @@ from typing import Optional, List -from thinc.api import zero_init, with_array, Softmax, chain, Model +from thinc.api import zero_init, with_array, Softmax_v2, chain, Model from thinc.types import Floats2d from ...util import registry from ...tokens import Doc -@registry.architectures("spacy.Tagger.v1") +@registry.architectures("spacy.Tagger.v2") def build_tagger_model( - tok2vec: Model[List[Doc], List[Floats2d]], nO: Optional[int] = None + tok2vec: Model[List[Doc], List[Floats2d]], nO: Optional[int] = None, normalize=False ) -> Model[List[Doc], List[Floats2d]]: """Build a tagger model, using a provided token-to-vector component. The tagger model simply adds a linear layer with softmax activation to predict scores @@ -19,7 +19,9 @@ def build_tagger_model( """ # TODO: glorot_uniform_init seems to work a bit better than zero_init here?! t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None - output_layer = Softmax(nO, t2v_width, init_W=zero_init) + output_layer = Softmax_v2( + nO, t2v_width, init_W=zero_init, normalize_outputs=normalize + ) softmax = with_array(output_layer) # type: ignore model = chain(tok2vec, softmax) model.set_ref("tok2vec", tok2vec) diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index 73d3799b1..24f98508f 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -25,7 +25,7 @@ BACKWARD_EXTEND = False default_model_config = """ [model] -@architectures = "spacy.Tagger.v1" +@architectures = "spacy.Tagger.v2" [model.tok2vec] @architectures = "spacy.Tok2Vec.v2" diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx index 6d00e829d..6808fe70e 100644 --- a/spacy/pipeline/senter.pyx +++ b/spacy/pipeline/senter.pyx @@ -20,7 +20,7 @@ BACKWARD_OVERWRITE = False default_model_config = """ [model] -@architectures = "spacy.Tagger.v1" +@architectures = "spacy.Tagger.v2" [model.tok2vec] @architectures = "spacy.HashEmbedCNN.v2" diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx index e21a9096e..d6ecbf084 100644 --- a/spacy/pipeline/tagger.pyx +++ b/spacy/pipeline/tagger.pyx @@ -27,7 +27,7 @@ BACKWARD_OVERWRITE = False default_model_config = """ [model] -@architectures = "spacy.Tagger.v1" +@architectures = "spacy.Tagger.v2" [model.tok2vec] @architectures = "spacy.HashEmbedCNN.v2" diff --git a/spacy/tests/pipeline/test_tok2vec.py b/spacy/tests/pipeline/test_tok2vec.py index a5ac85e1e..37104c78a 100644 --- a/spacy/tests/pipeline/test_tok2vec.py +++ b/spacy/tests/pipeline/test_tok2vec.py @@ -100,7 +100,7 @@ cfg_string = """ factory = "tagger" [components.tagger.model] - @architectures = "spacy.Tagger.v1" + @architectures = "spacy.Tagger.v2" nO = null [components.tagger.model.tok2vec] @@ -263,7 +263,7 @@ cfg_string_multi = """ factory = "tagger" [components.tagger.model] - @architectures = "spacy.Tagger.v1" + @architectures = "spacy.Tagger.v2" nO = null [components.tagger.model.tok2vec] @@ -373,7 +373,7 @@ cfg_string_multi_textcat = """ factory = "tagger" [components.tagger.model] - 
@architectures = "spacy.Tagger.v1" + @architectures = "spacy.Tagger.v2" nO = null [components.tagger.model.tok2vec] diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py index 1d50fd1d1..85e6f8b2c 100644 --- a/spacy/tests/serialize/test_serialize_config.py +++ b/spacy/tests/serialize/test_serialize_config.py @@ -59,7 +59,7 @@ subword_features = true factory = "tagger" [components.tagger.model] -@architectures = "spacy.Tagger.v1" +@architectures = "spacy.Tagger.v2" [components.tagger.model.tok2vec] @architectures = "spacy.Tok2VecListener.v1" @@ -110,7 +110,7 @@ subword_features = true factory = "tagger" [components.tagger.model] -@architectures = "spacy.Tagger.v1" +@architectures = "spacy.Tagger.v2" [components.tagger.model.tok2vec] @architectures = "spacy.Tok2VecListener.v1" diff --git a/spacy/tests/serialize/test_serialize_language.py b/spacy/tests/serialize/test_serialize_language.py index 6e7fa0e4e..c03287548 100644 --- a/spacy/tests/serialize/test_serialize_language.py +++ b/spacy/tests/serialize/test_serialize_language.py @@ -70,7 +70,7 @@ factory = "ner" factory = "tagger" [components.tagger.model] -@architectures = "spacy.Tagger.v1" +@architectures = "spacy.Tagger.v2" nO = null [components.tagger.model.tok2vec] diff --git a/spacy/tests/training/test_pretraining.py b/spacy/tests/training/test_pretraining.py index 8ee54b544..9359c8485 100644 --- a/spacy/tests/training/test_pretraining.py +++ b/spacy/tests/training/test_pretraining.py @@ -38,7 +38,7 @@ subword_features = true factory = "tagger" [components.tagger.model] -@architectures = "spacy.Tagger.v1" +@architectures = "spacy.Tagger.v2" [components.tagger.model.tok2vec] @architectures = "spacy.Tok2VecListener.v1" @@ -62,7 +62,7 @@ pipeline = ["tagger"] factory = "tagger" [components.tagger.model] -@architectures = "spacy.Tagger.v1" +@architectures = "spacy.Tagger.v2" [components.tagger.model.tok2vec] @architectures = "spacy.HashEmbedCNN.v1" @@ -106,7 +106,7 @@ subword_features = true factory = "tagger" [components.tagger.model] -@architectures = "spacy.Tagger.v1" +@architectures = "spacy.Tagger.v2" [components.tagger.model.tok2vec] @architectures = "spacy.Tok2VecListener.v1" diff --git a/spacy/tests/training/test_training.py b/spacy/tests/training/test_training.py index 0d73300d8..f1f8ce9d4 100644 --- a/spacy/tests/training/test_training.py +++ b/spacy/tests/training/test_training.py @@ -241,7 +241,7 @@ maxout_pieces = 3 factory = "tagger" [components.tagger.model] -@architectures = "spacy.Tagger.v1" +@architectures = "spacy.Tagger.v2" nO = null [components.tagger.model.tok2vec] diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md index 5fb3546a7..2bddcb28c 100644 --- a/website/docs/api/architectures.md +++ b/website/docs/api/architectures.md @@ -104,7 +104,7 @@ consisting of a CNN and a layer-normalized maxout activation function. > factory = "tagger" > > [components.tagger.model] -> @architectures = "spacy.Tagger.v1" +> @architectures = "spacy.Tagger.v2" > > [components.tagger.model.tok2vec] > @architectures = "spacy.Tok2VecListener.v1" @@ -158,8 +158,8 @@ be configured with the `attrs` argument. The suggested attributes are `NORM`, `PREFIX`, `SUFFIX` and `SHAPE`. This lets the model take into account some subword information, without construction a fully character-based representation. If pretrained vectors are available, they can be included in the -representation as well, with the vectors table kept static (i.e. it's -not updated). 
+representation as well, with the vectors table kept static (i.e. it's not +updated). | Name | Description | | ------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | @@ -613,14 +613,15 @@ same signature, but the `use_upper` argument was `True` by default. ## Tagging architectures {#tagger source="spacy/ml/models/tagger.py"} -### spacy.Tagger.v1 {#Tagger} +### spacy.Tagger.v2 {#Tagger} > #### Example Config > > ```ini > [model] -> @architectures = "spacy.Tagger.v1" +> @architectures = "spacy.Tagger.v2" > nO = null +> normalize = false > > [model.tok2vec] > # ... @@ -634,8 +635,18 @@ the token vectors. | ----------- | ------------------------------------------------------------------------------------------ | | `tok2vec` | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~ | | `nO` | The number of tags to output. Inferred from the data if `None`. ~~Optional[int]~~ | +| `normalize` | Normalize probabilities during inference. Defaults to `False`. ~~bool~~ | | **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ | + + +- The `normalize` argument was added in `spacy.Tagger.v2`. `spacy.Tagger.v1` + always normalizes probabilities during inference. + +The other arguments are shared between all versions. + + + ## Text classification architectures {#textcat source="spacy/ml/models/textcat.py"} A text classification architecture needs to take a [`Doc`](/api/doc) as input, From e021dc6279621ccdb00bd69961d12a19e47218a1 Mon Sep 17 00:00:00 2001 From: David Berenstein Date: Tue, 15 Mar 2022 16:42:33 +0100 Subject: [PATCH 066/123] Updated explenation for for classy classification (#10484) * Update universe.json added classy-classification to Spacy universe * Update universe.json added classy-classification to the spacy universe resources * Update universe.json corrected a small typo in json * Update website/meta/universe.json Co-authored-by: Sofie Van Landeghem * Update website/meta/universe.json Co-authored-by: Sofie Van Landeghem * Update website/meta/universe.json Co-authored-by: Sofie Van Landeghem * Update universe.json processed merge feedback * Update universe.json * updated information for Classy Classificaiton Made a more comprehensible and easy description for Classy Classification based on feedback of Philip Vollet to prepare for sharing. 
* added note about examples * corrected for wrong formatting changes * Update website/meta/universe.json with small typo correction Co-authored-by: Adriane Boyd * resolved another typo * Update website/meta/universe.json Co-authored-by: Sofie Van Landeghem Co-authored-by: Sofie Van Landeghem Co-authored-by: Adriane Boyd --- website/meta/universe.json | 43 +++++++++++++++++++++----------------- 1 file changed, 24 insertions(+), 19 deletions(-) diff --git a/website/meta/universe.json b/website/meta/universe.json index e178eab1f..a930363a4 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -2601,8 +2601,9 @@ }, { "id": "classyclassification", - "slogan": "A Python library for classy few-shot and zero-shot classification within spaCy.", - "description": "Huggingface does offer some nice models for few/zero-shot classification, but these are not tailored to multi-lingual approaches. Rasa NLU has a nice approach for this, but its too embedded in their codebase for easy usage outside of Rasa/chatbots. Additionally, it made sense to integrate sentence-transformers and Huggingface zero-shot, instead of default word embeddings. Finally, I decided to integrate with spaCy, since training a custom spaCy TextCategorizer seems like a lot of hassle if you want something quick and dirty.", + "title": "Classy Classification", + "slogan": "Have you ever struggled with needing a spaCy TextCategorizer but didn't have the time to train one from scratch? Classy Classification is the way to go!", + "description": "Have you ever struggled with needing a [spaCy TextCategorizer](https://spacy.io/api/textcategorizer) but didn't have the time to train one from scratch? Classy Classification is the way to go! For few-shot classification using [sentence-transformers](https://github.com/UKPLab/sentence-transformers) or [spaCy models](https://spacy.io/usage/models), provide a dictionary with labels and examples, or just provide a list of labels for zero shot-classification with [Huggingface zero-shot classifiers](https://huggingface.co/models?pipeline_tag=zero-shot-classification).", "github": "davidberenstein1957/classy-classification", "pip": "classy-classification", "code_example": [ @@ -2618,32 +2619,36 @@ " \"Do you also have some ovens.\"]", "}", "", + "# see github repo for examples on sentence-transformers and Huggingface", "nlp = spacy.load('en_core_web_md')", - "", - "classification_type = \"spacy_few_shot\"", - "if classification_type == \"spacy_few_shot\":", - " nlp.add_pipe(\"text_categorizer\", ", - " config={\"data\": data, \"model\": \"spacy\"}", - " )", - "elif classification_type == \"sentence_transformer_few_shot\":", - " nlp.add_pipe(\"text_categorizer\", ", - " config={\"data\": data, \"model\": \"sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2\"}", - " )", - "elif classification_type == \"huggingface_zero_shot\":", - " nlp.add_pipe(\"text_categorizer\", ", - " config={\"data\": list(data.keys()), \"cat_type\": \"zero\", \"model\": \"facebook/bart-large-mnli\"}", - " )", + "nlp.add_pipe(\"text_categorizer\", ", + " config={", + " \"data\": data,", + " \"model\": \"spacy\"", + " }", + ")", "", "print(nlp(\"I am looking for kitchen appliances.\")._.cats)", - "print([doc._.cats for doc in nlp.pipe([\"I am looking for kitchen appliances.\"])])" + "# Output:", + "#", + "# [{\"label\": \"furniture\", \"score\": 0.21}, {\"label\": \"kitchen\", \"score\": 0.79}]" ], "author": "David Berenstein", "author_links": { "github": "davidberenstein1957", "website": 
"https://www.linkedin.com/in/david-berenstein-1bab11105/" }, - "category": ["pipeline", "standalone"], - "tags": ["classification", "zero-shot", "few-shot", "sentence-transformers", "huggingface"], + "category": [ + "pipeline", + "standalone" + ], + "tags": [ + "classification", + "zero-shot", + "few-shot", + "sentence-transformers", + "huggingface" + ], "spacy_version": 3 }, { From a79cd3542b3dd667d8a97293462e22ed26a04ee5 Mon Sep 17 00:00:00 2001 From: Lj Miranda <12949683+ljvmiranda921@users.noreply.github.com> Date: Thu, 17 Mar 2022 01:14:34 +0800 Subject: [PATCH 067/123] Add displacy support for overlapping Spans (#10332) * Fix docstring for EntityRenderer * Add warning in displacy if doc.spans are empty * Implement parse_spans converter One notable change here is that the default spans_key is sc, and it's set by the user through the options. * Implement SpanRenderer Here, I implemented a SpanRenderer that looks similar to the EntityRenderer except for some templates. The spans_key, by default, is set to sc, but can be configured in the options (see parse_spans). The way I rendered these spans is per-token, i.e., I first check if each token (1) belongs to a given span type and (2) a starting token of a given span type. Once I have this information, I render them into the markup. * Fix mypy issues on typing * Add tests for displacy spans support * Update colors from RGB to hex Co-authored-by: Ines Montani * Remove unnecessary CSS properties * Add documentation for website * Remove unnecesasry scripts * Update wording on the documentation Co-authored-by: Sofie Van Landeghem * Put typing dependency on top of file * Put back z-index so that spans overlap properly * Make warning more explicit for spans_key Co-authored-by: Ines Montani Co-authored-by: Sofie Van Landeghem --- spacy/displacy/__init__.py | 41 +++- spacy/displacy/render.py | 179 +++++++++++++++++- spacy/displacy/templates.py | 49 +++++ spacy/errors.py | 4 + spacy/tests/test_displacy.py | 86 +++++++++ website/docs/api/top-level.md | 32 +++- website/docs/images/displacy-span-custom.html | 31 +++ website/docs/images/displacy-span.html | 41 ++++ website/docs/usage/visualizers.md | 53 ++++++ 9 files changed, 501 insertions(+), 15 deletions(-) create mode 100644 website/docs/images/displacy-span-custom.html create mode 100644 website/docs/images/displacy-span.html diff --git a/spacy/displacy/__init__.py b/spacy/displacy/__init__.py index 25d530c83..aa00c95d8 100644 --- a/spacy/displacy/__init__.py +++ b/spacy/displacy/__init__.py @@ -4,10 +4,10 @@ spaCy's built in visualization suite for dependencies and named entities. 
DOCS: https://spacy.io/api/top-level#displacy USAGE: https://spacy.io/usage/visualizers """ -from typing import Union, Iterable, Optional, Dict, Any, Callable +from typing import List, Union, Iterable, Optional, Dict, Any, Callable import warnings -from .render import DependencyRenderer, EntityRenderer +from .render import DependencyRenderer, EntityRenderer, SpanRenderer from ..tokens import Doc, Span from ..errors import Errors, Warnings from ..util import is_in_jupyter @@ -44,6 +44,7 @@ def render( factories = { "dep": (DependencyRenderer, parse_deps), "ent": (EntityRenderer, parse_ents), + "span": (SpanRenderer, parse_spans), } if style not in factories: raise ValueError(Errors.E087.format(style=style)) @@ -203,6 +204,42 @@ def parse_ents(doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]: return {"text": doc.text, "ents": ents, "title": title, "settings": settings} +def parse_spans(doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]: + """Generate spans in [{start: i, end: i, label: 'label'}] format. + + doc (Doc): Document to parse. + options (Dict[str, any]): Span-specific visualisation options. + RETURNS (dict): Generated span types keyed by text (original text) and spans. + """ + kb_url_template = options.get("kb_url_template", None) + spans_key = options.get("spans_key", "sc") + spans = [ + { + "start": span.start_char, + "end": span.end_char, + "start_token": span.start, + "end_token": span.end, + "label": span.label_, + "kb_id": span.kb_id_ if span.kb_id_ else "", + "kb_url": kb_url_template.format(span.kb_id_) if kb_url_template else "#", + } + for span in doc.spans[spans_key] + ] + tokens = [token.text for token in doc] + + if not spans: + warnings.warn(Warnings.W117.format(spans_key=spans_key)) + title = doc.user_data.get("title", None) if hasattr(doc, "user_data") else None + settings = get_doc_settings(doc) + return { + "text": doc.text, + "spans": spans, + "title": title, + "settings": settings, + "tokens": tokens, + } + + def set_render_wrapper(func: Callable[[str], str]) -> None: """Set an optional wrapper function that is called around the generated HTML markup on displacy.render. 
This can be used to allow integration into diff --git a/spacy/displacy/render.py b/spacy/displacy/render.py index a032d843b..2925c68a0 100644 --- a/spacy/displacy/render.py +++ b/spacy/displacy/render.py @@ -1,12 +1,15 @@ -from typing import Dict, Any, List, Optional, Union +from typing import Any, Dict, List, Optional, Union import uuid +import itertools -from .templates import TPL_DEP_SVG, TPL_DEP_WORDS, TPL_DEP_WORDS_LEMMA, TPL_DEP_ARCS -from .templates import TPL_ENT, TPL_ENT_RTL, TPL_FIGURE, TPL_TITLE, TPL_PAGE -from .templates import TPL_ENTS, TPL_KB_LINK -from ..util import minify_html, escape_html, registry from ..errors import Errors - +from ..util import escape_html, minify_html, registry +from .templates import TPL_DEP_ARCS, TPL_DEP_SVG, TPL_DEP_WORDS +from .templates import TPL_DEP_WORDS_LEMMA, TPL_ENT, TPL_ENT_RTL, TPL_ENTS +from .templates import TPL_FIGURE, TPL_KB_LINK, TPL_PAGE, TPL_SPAN +from .templates import TPL_SPAN_RTL, TPL_SPAN_SLICE, TPL_SPAN_SLICE_RTL +from .templates import TPL_SPAN_START, TPL_SPAN_START_RTL, TPL_SPANS +from .templates import TPL_TITLE DEFAULT_LANG = "en" DEFAULT_DIR = "ltr" @@ -33,6 +36,168 @@ DEFAULT_LABEL_COLORS = { } +class SpanRenderer: + """Render Spans as SVGs.""" + + style = "span" + + def __init__(self, options: Dict[str, Any] = {}) -> None: + """Initialise span renderer + + options (dict): Visualiser-specific options (colors, spans) + """ + # Set up the colors and overall look + colors = dict(DEFAULT_LABEL_COLORS) + user_colors = registry.displacy_colors.get_all() + for user_color in user_colors.values(): + if callable(user_color): + # Since this comes from the function registry, we want to make + # sure we support functions that *return* a dict of colors + user_color = user_color() + if not isinstance(user_color, dict): + raise ValueError(Errors.E925.format(obj=type(user_color))) + colors.update(user_color) + colors.update(options.get("colors", {})) + self.default_color = DEFAULT_ENTITY_COLOR + self.colors = {label.upper(): color for label, color in colors.items()} + + # Set up how the text and labels will be rendered + self.direction = DEFAULT_DIR + self.lang = DEFAULT_LANG + self.top_offset = options.get("top_offset", 40) + self.top_offset_step = options.get("top_offset_step", 17) + + # Set up which templates will be used + template = options.get("template") + if template: + self.span_template = template["span"] + self.span_slice_template = template["slice"] + self.span_start_template = template["start"] + else: + if self.direction == "rtl": + self.span_template = TPL_SPAN_RTL + self.span_slice_template = TPL_SPAN_SLICE_RTL + self.span_start_template = TPL_SPAN_START_RTL + else: + self.span_template = TPL_SPAN + self.span_slice_template = TPL_SPAN_SLICE + self.span_start_template = TPL_SPAN_START + + def render( + self, parsed: List[Dict[str, Any]], page: bool = False, minify: bool = False + ) -> str: + """Render complete markup. + + parsed (list): Dependency parses to render. + page (bool): Render parses wrapped as full HTML page. + minify (bool): Minify HTML markup. + RETURNS (str): Rendered HTML markup. 
+ """ + rendered = [] + for i, p in enumerate(parsed): + if i == 0: + settings = p.get("settings", {}) + self.direction = settings.get("direction", DEFAULT_DIR) + self.lang = settings.get("lang", DEFAULT_LANG) + rendered.append(self.render_spans(p["tokens"], p["spans"], p.get("title"))) + + if page: + docs = "".join([TPL_FIGURE.format(content=doc) for doc in rendered]) + markup = TPL_PAGE.format(content=docs, lang=self.lang, dir=self.direction) + else: + markup = "".join(rendered) + if minify: + return minify_html(markup) + return markup + + def render_spans( + self, + tokens: List[str], + spans: List[Dict[str, Any]], + title: Optional[str], + ) -> str: + """Render span types in text. + + Spans are rendered per-token, this means that for each token, we check if it's part + of a span slice (a member of a span type) or a span start (the starting token of a + given span type). + + tokens (list): Individual tokens in the text + spans (list): Individual entity spans and their start, end, label, kb_id and kb_url. + title (str / None): Document title set in Doc.user_data['title']. + """ + per_token_info = [] + for idx, token in enumerate(tokens): + # Identify if a token belongs to a Span (and which) and if it's a + # start token of said Span. We'll use this for the final HTML render + token_markup: Dict[str, Any] = {} + token_markup["text"] = token + entities = [] + for span in spans: + ent = {} + if span["start_token"] <= idx < span["end_token"]: + ent["label"] = span["label"] + ent["is_start"] = True if idx == span["start_token"] else False + kb_id = span.get("kb_id", "") + kb_url = span.get("kb_url", "#") + ent["kb_link"] = ( + TPL_KB_LINK.format(kb_id=kb_id, kb_url=kb_url) if kb_id else "" + ) + entities.append(ent) + token_markup["entities"] = entities + per_token_info.append(token_markup) + + markup = self._render_markup(per_token_info) + markup = TPL_SPANS.format(content=markup, dir=self.direction) + if title: + markup = TPL_TITLE.format(title=title) + markup + return markup + + def _render_markup(self, per_token_info: List[Dict[str, Any]]) -> str: + """Render the markup from per-token information""" + markup = "" + for token in per_token_info: + entities = sorted(token["entities"], key=lambda d: d["label"]) + if entities: + slices = self._get_span_slices(token["entities"]) + starts = self._get_span_starts(token["entities"]) + markup += self.span_template.format( + text=token["text"], span_slices=slices, span_starts=starts + ) + else: + markup += escape_html(token["text"] + " ") + return markup + + def _get_span_slices(self, entities: List[Dict]) -> str: + """Get the rendered markup of all Span slices""" + span_slices = [] + for entity, step in zip(entities, itertools.count(step=self.top_offset_step)): + color = self.colors.get(entity["label"].upper(), self.default_color) + span_slice = self.span_slice_template.format( + bg=color, top_offset=self.top_offset + step + ) + span_slices.append(span_slice) + return "".join(span_slices) + + def _get_span_starts(self, entities: List[Dict]) -> str: + """Get the rendered markup of all Span start tokens""" + span_starts = [] + for entity, step in zip(entities, itertools.count(step=self.top_offset_step)): + color = self.colors.get(entity["label"].upper(), self.default_color) + span_start = ( + self.span_start_template.format( + bg=color, + top_offset=self.top_offset + step, + label=entity["label"], + kb_link=entity["kb_link"], + ) + if entity["is_start"] + else "" + ) + span_starts.append(span_start) + return "".join(span_starts) + + class 
DependencyRenderer: """Render dependency parses as SVGs.""" @@ -242,7 +407,7 @@ class EntityRenderer: style = "ent" def __init__(self, options: Dict[str, Any] = {}) -> None: - """Initialise dependency renderer. + """Initialise entity renderer. options (dict): Visualiser-specific options (colors, ents) """ diff --git a/spacy/displacy/templates.py b/spacy/displacy/templates.py index e7d3d4266..ff81e7a1d 100644 --- a/spacy/displacy/templates.py +++ b/spacy/displacy/templates.py @@ -62,6 +62,55 @@ TPL_ENT_RTL = """ """ +TPL_SPANS = """ +
{content}
+""" + +TPL_SPAN = """ + + {text} + {span_slices} + {span_starts} + +""" + +TPL_SPAN_SLICE = """ + + +""" + + +TPL_SPAN_START = """ + + + {label}{kb_link} + + + +""" + +TPL_SPAN_RTL = """ + + {text} + {span_slices} + {span_starts} + +""" + +TPL_SPAN_SLICE_RTL = """ + + +""" + +TPL_SPAN_START_RTL = """ + + + {label}{kb_link} + + +""" + + # Important: this needs to start with a space! TPL_KB_LINK = """ {kb_id} diff --git a/spacy/errors.py b/spacy/errors.py index 5399e489b..fe37351f7 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -192,6 +192,10 @@ class Warnings(metaclass=ErrorsWithCodes): W115 = ("Skipping {method}: the floret vector table cannot be modified. " "Vectors are calculated from character ngrams.") W116 = ("Unable to clean attribute '{attr}'.") + W117 = ("No spans to visualize found in Doc object with spans_key: '{spans_key}'. If this is " + "surprising to you, make sure the Doc was processed using a model " + "that supports span categorization, and check the `doc.spans[spans_key]` " + "property manually if necessary.") class Errors(metaclass=ErrorsWithCodes): diff --git a/spacy/tests/test_displacy.py b/spacy/tests/test_displacy.py index 392c95e42..ccad7e342 100644 --- a/spacy/tests/test_displacy.py +++ b/spacy/tests/test_displacy.py @@ -96,6 +96,92 @@ def test_issue5838(): assert found == 4 +def test_displacy_parse_spans(en_vocab): + """Test that spans on a Doc are converted into displaCy's format.""" + doc = Doc(en_vocab, words=["Welcome", "to", "the", "Bank", "of", "China"]) + doc.spans["sc"] = [Span(doc, 3, 6, "ORG"), Span(doc, 5, 6, "GPE")] + spans = displacy.parse_spans(doc) + assert isinstance(spans, dict) + assert spans["text"] == "Welcome to the Bank of China " + assert spans["spans"] == [ + { + "start": 15, + "end": 28, + "start_token": 3, + "end_token": 6, + "label": "ORG", + "kb_id": "", + "kb_url": "#", + }, + { + "start": 23, + "end": 28, + "start_token": 5, + "end_token": 6, + "label": "GPE", + "kb_id": "", + "kb_url": "#", + }, + ] + + +def test_displacy_parse_spans_with_kb_id_options(en_vocab): + """Test that spans with kb_id on a Doc are converted into displaCy's format""" + doc = Doc(en_vocab, words=["Welcome", "to", "the", "Bank", "of", "China"]) + doc.spans["sc"] = [ + Span(doc, 3, 6, "ORG", kb_id="Q790068"), + Span(doc, 5, 6, "GPE", kb_id="Q148"), + ] + + spans = displacy.parse_spans( + doc, {"kb_url_template": "https://wikidata.org/wiki/{}"} + ) + assert isinstance(spans, dict) + assert spans["text"] == "Welcome to the Bank of China " + assert spans["spans"] == [ + { + "start": 15, + "end": 28, + "start_token": 3, + "end_token": 6, + "label": "ORG", + "kb_id": "Q790068", + "kb_url": "https://wikidata.org/wiki/Q790068", + }, + { + "start": 23, + "end": 28, + "start_token": 5, + "end_token": 6, + "label": "GPE", + "kb_id": "Q148", + "kb_url": "https://wikidata.org/wiki/Q148", + }, + ] + + +def test_displacy_parse_spans_different_spans_key(en_vocab): + """Test that spans in a different spans key will be parsed""" + doc = Doc(en_vocab, words=["Welcome", "to", "the", "Bank", "of", "China"]) + doc.spans["sc"] = [Span(doc, 3, 6, "ORG"), Span(doc, 5, 6, "GPE")] + doc.spans["custom"] = [Span(doc, 3, 6, "BANK")] + spans = displacy.parse_spans(doc, options={"spans_key": "custom"}) + + assert isinstance(spans, dict) + assert spans["text"] == "Welcome to the Bank of China " + assert spans["spans"] == [ + { + "start": 15, + "end": 28, + "start_token": 3, + "end_token": 6, + "label": "BANK", + "kb_id": "", + "kb_url": "#", + } + ] + + def 
test_displacy_parse_ents(en_vocab): """Test that named entities on a Doc are converted into displaCy's format.""" doc = Doc(en_vocab, words=["But", "Google", "is", "starting", "from", "behind"]) diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index 1a3e9da46..6d7431f28 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -320,12 +320,31 @@ If a setting is not present in the options, the default value will be used. | `template` 2.2 | Optional template to overwrite the HTML used to render entity spans. Should be a format string and can use `{bg}`, `{text}` and `{label}`. See [`templates.py`](%%GITHUB_SPACY/spacy/displacy/templates.py) for examples. ~~Optional[str]~~ | | `kb_url_template` 3.2.1 | Optional template to construct the KB url for the entity to link to. Expects a python f-string format with single field to fill in. ~~Optional[str]~~ | -By default, displaCy comes with colors for all entity types used by -[spaCy's trained pipelines](/models). If you're using custom entity types, you -can use the `colors` setting to add your own colors for them. Your application -or pipeline package can also expose a -[`spacy_displacy_colors` entry point](/usage/saving-loading#entry-points-displacy) -to add custom labels and their colors automatically. + +#### Span Visualizer options {#displacy_options-span} + +> #### Example +> +> ```python +> options = {"spans_key": "sc"} +> displacy.serve(doc, style="span", options=options) +> ``` + +| Name | Description | +|-----------------|---------------------------------------------------------------------------------------------------------------------------------------------------------| +| `spans_key` | Which spans key to render spans from. Default is `"sc"`. ~~str~~ | +| `templates` | Dictionary containing the keys `"span"`, `"slice"`, and `"start"`. These dictate how the overall span, a span slice, and the starting token will be rendered. ~~Optional[Dict[str, str]~~ | +| `kb_url_template` | Optional template to construct the KB url for the entity to link to. Expects a python f-string format with single field to fill in ~~Optional[str]~~ | +| `colors` | Color overrides. Entity types should be mapped to color names or values. ~~Dict[str, str]~~ | + + +By default, displaCy comes with colors for all entity types used by [spaCy's +trained pipelines](/models) for both entity and span visualizer. If you're +using custom entity types, you can use the `colors` setting to add your own +colors for them. Your application or pipeline package can also expose a +[`spacy_displacy_colors` entry +point](/usage/saving-loading#entry-points-displacy) to add custom labels and +their colors automatically. By default, displaCy links to `#` for entities without a `kb_id` set on their span. If you wish to link an entity to their URL then consider using the @@ -335,6 +354,7 @@ span. If you wish to link an entity to their URL then consider using the should redirect you to their Wikidata page, in this case `https://www.wikidata.org/wiki/Q95`. + ## registry {#registry source="spacy/util.py" new="3"} spaCy's function registry extends diff --git a/website/docs/images/displacy-span-custom.html b/website/docs/images/displacy-span-custom.html new file mode 100644 index 000000000..97dd3b140 --- /dev/null +++ b/website/docs/images/displacy-span-custom.html @@ -0,0 +1,31 @@ +
+ Welcome to the + + Bank + + + + + BANK + + + + + of + + + + + China + + + + + . +
\ No newline at end of file diff --git a/website/docs/images/displacy-span.html b/website/docs/images/displacy-span.html new file mode 100644 index 000000000..9bbc6403c --- /dev/null +++ b/website/docs/images/displacy-span.html @@ -0,0 +1,41 @@ +
+ Welcome to the + + Bank + + + + + ORG + + + + + of + + + + + + China + + + + + + + GPE + + + + . +
\ No newline at end of file diff --git a/website/docs/usage/visualizers.md b/website/docs/usage/visualizers.md index 072718f91..f98c43224 100644 --- a/website/docs/usage/visualizers.md +++ b/website/docs/usage/visualizers.md @@ -167,6 +167,59 @@ This feature is especially handy if you're using displaCy to compare performance at different stages of a process, e.g. during training. Here you could use the title for a brief description of the text example and the number of iterations. +## Visualizing spans {#span} + +The span visualizer, `span`, highlights overlapping spans in a text. + +```python +### Span example +import spacy +from spacy import displacy +from spacy.tokens import Span + +text = "Welcome to the Bank of China." + +nlp = spacy.blank("en") +doc = nlp(text) + +doc.spans["sc"] = [ + Span(doc, 3, 6, "ORG"), + Span(doc, 5, 6, "GPE"), +] + +displacy.serve(doc, style="span") +``` + +import DisplacySpanHtml from 'images/displacy-span.html' + +
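`displacy.serve` starts a local web server; to produce a standalone HTML file instead (roughly how a static snippet like the one imported above could be generated), `displacy.render` can be used with `page=True`. A small self-contained sketch (the output filename is arbitrary, and the setup repeats the span example so it runs on its own):

```python
from pathlib import Path

import spacy
from spacy import displacy
from spacy.tokens import Span

nlp = spacy.blank("en")
doc = nlp("Welcome to the Bank of China.")
doc.spans["sc"] = [Span(doc, 3, 6, "ORG"), Span(doc, 5, 6, "GPE")]

# page=True wraps the markup in a full HTML page; jupyter=False forces a string
# return value even when running inside a notebook.
html = displacy.render(doc, style="span", page=True, jupyter=False)
Path("displacy-span-demo.html").write_text(html, encoding="utf8")
```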