From 28299644fc14ed7693a26bf03e2ec0cbef9c28e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Thu, 13 Jan 2022 09:03:55 +0100 Subject: [PATCH 01/16] Speed up the StateC::L feature function (#10019) * Speed up the StateC::L feature function This function gets the n-th most-recent left-arc with a particular head. Before this change, StateC::L would construct a vector of all left-arcs with the given head and then pick the n-th most recent from that vector. Since the number of left-arcs strongly correlates with the doc length and the feature is constructed for every transition, this can make transition-parsing quadratic. With this change StateC::L: - Searches left-arcs backwards. - Stops early when the n-th matching transition is found. - Does not construct a vector (reducing memory pressure). This change doesn't avoid the linear search when the transition that is queried does not occur in the left-arcs. Regardless, performance is improved quite a bit with very long docs: Before: N Time 400 3.3 800 5.4 1600 11.6 3200 30.7 After: N Time 400 3.2 800 5.0 1600 9.5 3200 23.2 We can probably do better with more tailored data structures, but I first wanted to make a low-impact PR. Found while investigating #9858. * StateC::L: simplify loop --- spacy/pipeline/_parser_internals/_state.pxd | 23 +++++++++++++-------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/spacy/pipeline/_parser_internals/_state.pxd b/spacy/pipeline/_parser_internals/_state.pxd index 161f3ca48..27623e7c6 100644 --- a/spacy/pipeline/_parser_internals/_state.pxd +++ b/spacy/pipeline/_parser_internals/_state.pxd @@ -1,3 +1,4 @@ +from cython.operator cimport dereference as deref, preincrement as incr from libc.string cimport memcpy, memset from libc.stdlib cimport calloc, free from libc.stdint cimport uint32_t, uint64_t @@ -184,16 +185,20 @@ cdef cppclass StateC: int L(int head, int idx) nogil const: if idx < 1 or this._left_arcs.size() == 0: return -1 - cdef vector[int] lefts - for i in range(this._left_arcs.size()): - arc = this._left_arcs.at(i) + + # Work backwards through left-arcs to find the arc at the + # requested index more quickly. + cdef size_t child_index = 0 + it = this._left_arcs.const_rbegin() + while it != this._left_arcs.rend(): + arc = deref(it) if arc.head == head and arc.child != -1 and arc.child < head: - lefts.push_back(arc.child) - idx = (lefts.size()) - idx - if idx < 0: - return -1 - else: - return lefts.at(idx) + child_index += 1 + if child_index == idx: + return arc.child + incr(it) + + return -1 int R(int head, int idx) nogil const: if idx < 1 or this._right_arcs.size() == 0: From a784b12eff48df9281b184cb7005e66bbd2e3aca Mon Sep 17 00:00:00 2001 From: ColleterVi <36503688+ColleterVi@users.noreply.github.com> Date: Thu, 13 Jan 2022 12:25:06 +0100 Subject: [PATCH 02/16] fix: new restcountries url (#10043) Url extension "eu" and path "rest" are no longer available. Replacing them for a working url. 
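The backward-search idea in the StateC::L patch above is easier to see outside Cython. The sketch below is only an illustrative Python model, not part of the patch: it represents the left-arcs as a plain list of (head, child) tuples and shows the newest-to-oldest scan with an early exit that replaces the old build-a-vector-then-index approach.

```
def nth_most_recent_left_child(left_arcs, head, idx):
    """Return the child of the idx-th most recent left-arc headed by `head`, or -1."""
    if idx < 1:
        return -1
    matches = 0
    # Walk the arcs from newest to oldest and stop as soon as the idx-th match
    # is found, instead of first materialising a list of all matches.
    for arc_head, arc_child in reversed(left_arcs):
        if arc_head == head and arc_child != -1 and arc_child < head:
            matches += 1
            if matches == idx:
                return arc_child
    return -1
```

As in the Cython version, a query for a head that never occurs among the left-arcs still scans them all, which is the remaining linear cost the commit message mentions.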
--- website/docs/usage/processing-pipelines.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md index 0264a2825..11fd1459d 100644 --- a/website/docs/usage/processing-pipelines.md +++ b/website/docs/usage/processing-pipelines.md @@ -1479,7 +1479,7 @@ especially useful it you want to pass in a string instead of calling ### Example: Pipeline component for GPE entities and country meta data via a REST API {#component-example3} This example shows the implementation of a pipeline component that fetches -country meta data via the [REST Countries API](https://restcountries.eu), sets +country meta data via the [REST Countries API](https://restcountries.com), sets entity annotations for countries and sets custom attributes on the `Doc` and `Span` – for example, the capital, latitude/longitude coordinates and even the country flag. @@ -1495,7 +1495,7 @@ from spacy.tokens import Doc, Span, Token @Language.factory("rest_countries") class RESTCountriesComponent: def __init__(self, nlp, name, label="GPE"): - r = requests.get("https://restcountries.eu/rest/v2/all") + r = requests.get("https://restcountries.com/v2/all") r.raise_for_status() # make sure requests raises an error if it fails countries = r.json() # Convert API response to dict keyed by country name for easy lookup From 58bdd8607bb917f3437fdf5993dec5b6e58930c8 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Mon, 17 Jan 2022 16:16:22 +0900 Subject: [PATCH 03/16] Bump sudachipy version (#9917) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Edited Slovenian stop words list (#9707) * Noun chunks for Italian (#9662) * added it vocab * copied portuguese * added possessive determiner * added conjed Nps * added nmoded Nps * test misc * more examples * fixed typo * fixed parenth * fixed comma * comma fix * added syntax iters * fix some index problems * fixed index * corrected heads for test case * fixed tets case * fixed determiner gender * cleaned left over * added example with apostophe * French NP review (#9667) * adapted from pt * added basic tests * added fr vocab * fixed noun chunks * more examples * typo fix * changed naming * changed the naming * typo fix * Add Japanese kana characters to default exceptions (fix #9693) (#9742) This includes the main kana, or phonetic characters, used in Japanese. There are some supplemental kana blocks in Unicode outside the BMP that could also be included, but because their actual use is rare I omitted them for now, but maybe they should be added. The omitted blocks are: - Kana Supplement - Kana Extended (A and B) - Small Kana Extension * Remove NER words from stop words in Norwegian (#9820) Default stop words in Norwegian bokmål (nb) in Spacy contain important entities, e.g. France, Germany, Russia, Sweden and USA, police district, important units of time, e.g. months and days of the week, and organisations. Nobody expects their presence among the default stop words. There is a danger of users complying with the general recommendation of filtering out stop words, while being unaware of filtering out important entities from their data. See explanation in https://github.com/explosion/spaCy/issues/3052#issuecomment-986756711 and comment https://github.com/explosion/spaCy/issues/3052#issuecomment-986951831 * Bump sudachipy version * Update sudachipy versions * Bump versions Bumping to the most recent dictionary just to keep thing current. 
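As a side note on the kana change listed above: the ranges added to `spacy/lang/char_classes.py` in this patch are the standard hiragana and katakana blocks. The snippet below is not part of the patch, just a stdlib check of what those ranges cover and why kana belong in the `_uncased` group (they have no upper/lower case distinction).

```
import re

_hiragana = r"\u3040-\u309F"
_katakana = r"\u30A0-\u30FF\u30FC"  # the patch writes the prolonged sound mark literally as "ー"
kana_re = re.compile(f"^[{_hiragana}{_katakana}]+$")

for text in ("ひらがな", "カタカナ", "ラーメン"):
    assert kana_re.match(text)
    # Kana have no case distinction, so upper()/lower() are no-ops.
    assert text.upper() == text.lower() == text
```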
Bumping sudachipy to 5.2 because older versions don't support recent dictionaries. Co-authored-by: Sofie Van Landeghem Co-authored-by: Richard Hudson Co-authored-by: Duygu Altinok Co-authored-by: Haakon Meland Eriksen --- setup.cfg | 4 +- spacy/lang/char_classes.py | 5 + spacy/lang/fr/syntax_iterators.py | 72 ++++++-- spacy/lang/it/__init__.py | 4 +- spacy/lang/it/syntax_iterators.py | 86 +++++++++ spacy/lang/nb/stop_words.py | 30 ++-- spacy/lang/sl/stop_words.py | 130 +------------- spacy/tests/conftest.py | 10 ++ spacy/tests/lang/fr/test_noun_chunks.py | 224 +++++++++++++++++++++++- spacy/tests/lang/it/test_noun_chunks.py | 221 +++++++++++++++++++++++ 10 files changed, 624 insertions(+), 162 deletions(-) create mode 100644 spacy/lang/it/syntax_iterators.py create mode 100644 spacy/tests/lang/it/test_noun_chunks.py diff --git a/setup.cfg b/setup.cfg index 50e982cbf..586a044ff 100644 --- a/setup.cfg +++ b/setup.cfg @@ -108,8 +108,8 @@ apple = thinc-apple-ops>=0.0.4,<1.0.0 # Language tokenizers with external dependencies ja = - sudachipy>=0.4.9 - sudachidict_core>=20200330 + sudachipy>=0.5.2,!=0.6.1 + sudachidict_core>=20211220 ko = natto-py==0.9.0 th = diff --git a/spacy/lang/char_classes.py b/spacy/lang/char_classes.py index 9e5441a4f..b15bb3cf3 100644 --- a/spacy/lang/char_classes.py +++ b/spacy/lang/char_classes.py @@ -45,6 +45,10 @@ _hangul_syllables = r"\uAC00-\uD7AF" _hangul_jamo = r"\u1100-\u11FF" _hangul = _hangul_syllables + _hangul_jamo +_hiragana = r"\u3040-\u309F" +_katakana = r"\u30A0-\u30FFー" +_kana = _hiragana + _katakana + # letters with diacritics - Catalan, Czech, Latin, Latvian, Lithuanian, Polish, Slovak, Turkish, Welsh _latin_u_extendedA = ( r"\u0100\u0102\u0104\u0106\u0108\u010A\u010C\u010E\u0110\u0112\u0114\u0116\u0118\u011A\u011C" @@ -244,6 +248,7 @@ _uncased = ( + _tamil + _telugu + _hangul + + _kana + _cjk ) diff --git a/spacy/lang/fr/syntax_iterators.py b/spacy/lang/fr/syntax_iterators.py index d86662693..5f7ba5c10 100644 --- a/spacy/lang/fr/syntax_iterators.py +++ b/spacy/lang/fr/syntax_iterators.py @@ -6,16 +6,35 @@ from ...tokens import Doc, Span def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]: - """Detect base noun phrases from a dependency parse. Works on Doc and Span.""" - # fmt: off - labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"] - # fmt: on + """ + Detect base noun phrases from a dependency parse. Works on both Doc and Span. + """ + labels = [ + "nsubj", + "nsubj:pass", + "obj", + "obl", + "obl:agent", + "obl:arg", + "obl:mod", + "nmod", + "pcomp", + "appos", + "ROOT", + ] + post_modifiers = ["flat", "flat:name", "flat:foreign", "fixed", "compound"] doc = doclike.doc # Ensure works on both Doc and Span. 
if not doc.has_annotation("DEP"): raise ValueError(Errors.E029) - np_deps = [doc.vocab.strings[label] for label in labels] - conj = doc.vocab.strings.add("conj") + np_deps = {doc.vocab.strings.add(label) for label in labels} + np_modifs = {doc.vocab.strings.add(modifier) for modifier in post_modifiers} np_label = doc.vocab.strings.add("NP") + adj_label = doc.vocab.strings.add("amod") + det_label = doc.vocab.strings.add("det") + det_pos = doc.vocab.strings.add("DET") + adp_pos = doc.vocab.strings.add("ADP") + conj_label = doc.vocab.strings.add("conj") + conj_pos = doc.vocab.strings.add("CCONJ") prev_end = -1 for i, word in enumerate(doclike): if word.pos not in (NOUN, PROPN, PRON): @@ -24,16 +43,45 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]: if word.left_edge.i <= prev_end: continue if word.dep in np_deps: - prev_end = word.right_edge.i - yield word.left_edge.i, word.right_edge.i + 1, np_label - elif word.dep == conj: + right_childs = list(word.rights) + right_child = right_childs[0] if right_childs else None + + if right_child: + if ( + right_child.dep == adj_label + ): # allow chain of adjectives by expanding to right + right_end = right_child.right_edge + elif ( + right_child.dep == det_label and right_child.pos == det_pos + ): # cut relative pronouns here + right_end = right_child + elif right_child.dep in np_modifs: # Check if we can expand to right + right_end = word.right_edge + else: + right_end = word + else: + right_end = word + prev_end = right_end.i + + left_index = word.left_edge.i + left_index = ( + left_index + 1 if word.left_edge.pos == adp_pos else left_index + ) + + yield left_index, right_end.i + 1, np_label + elif word.dep == conj_label: head = word.head - while head.dep == conj and head.head.i < head.i: + while head.dep == conj_label and head.head.i < head.i: head = head.head # If the head is an NP, and we're coordinated to it, we're an NP if head.dep in np_deps: - prev_end = word.right_edge.i - yield word.left_edge.i, word.right_edge.i + 1, np_label + prev_end = word.i + + left_index = word.left_edge.i # eliminate left attached conjunction + left_index = ( + left_index + 1 if word.left_edge.pos == conj_pos else left_index + ) + yield left_index, word.i + 1, np_label SYNTAX_ITERATORS = {"noun_chunks": noun_chunks} diff --git a/spacy/lang/it/__init__.py b/spacy/lang/it/__init__.py index 1edebc837..ecf322bd7 100644 --- a/spacy/lang/it/__init__.py +++ b/spacy/lang/it/__init__.py @@ -6,13 +6,15 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES from ...language import Language, BaseDefaults from .lemmatizer import ItalianLemmatizer +from .syntax_iterators import SYNTAX_ITERATORS class ItalianDefaults(BaseDefaults): tokenizer_exceptions = TOKENIZER_EXCEPTIONS - stop_words = STOP_WORDS prefixes = TOKENIZER_PREFIXES infixes = TOKENIZER_INFIXES + stop_words = STOP_WORDS + syntax_iterators = SYNTAX_ITERATORS class Italian(Language): diff --git a/spacy/lang/it/syntax_iterators.py b/spacy/lang/it/syntax_iterators.py new file mode 100644 index 000000000..f63df3fad --- /dev/null +++ b/spacy/lang/it/syntax_iterators.py @@ -0,0 +1,86 @@ +from typing import Union, Iterator, Tuple + +from ...symbols import NOUN, PROPN, PRON +from ...errors import Errors +from ...tokens import Doc, Span + + +def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]: + """ + Detect base noun phrases from a dependency parse. Works on both Doc and Span. 
+ """ + labels = [ + "nsubj", + "nsubj:pass", + "obj", + "obl", + "obl:agent", + "nmod", + "pcomp", + "appos", + "ROOT", + ] + post_modifiers = ["flat", "flat:name", "fixed", "compound"] + dets = ["det", "det:poss"] + doc = doclike.doc # Ensure works on both Doc and Span. + if not doc.has_annotation("DEP"): + raise ValueError(Errors.E029) + np_deps = {doc.vocab.strings.add(label) for label in labels} + np_modifs = {doc.vocab.strings.add(modifier) for modifier in post_modifiers} + np_label = doc.vocab.strings.add("NP") + adj_label = doc.vocab.strings.add("amod") + det_labels = {doc.vocab.strings.add(det) for det in dets} + det_pos = doc.vocab.strings.add("DET") + adp_label = doc.vocab.strings.add("ADP") + conj = doc.vocab.strings.add("conj") + conj_pos = doc.vocab.strings.add("CCONJ") + prev_end = -1 + for i, word in enumerate(doclike): + if word.pos not in (NOUN, PROPN, PRON): + continue + # Prevent nested chunks from being produced + if word.left_edge.i <= prev_end: + continue + if word.dep in np_deps: + right_childs = list(word.rights) + right_child = right_childs[0] if right_childs else None + + if right_child: + if ( + right_child.dep == adj_label + ): # allow chain of adjectives by expanding to right + right_end = right_child.right_edge + elif ( + right_child.dep in det_labels and right_child.pos == det_pos + ): # cut relative pronouns here + right_end = right_child + elif right_child.dep in np_modifs: # Check if we can expand to right + right_end = word.right_edge + else: + right_end = word + else: + right_end = word + prev_end = right_end.i + + left_index = word.left_edge.i + left_index = ( + left_index + 1 if word.left_edge.pos == adp_label else left_index + ) + + yield left_index, right_end.i + 1, np_label + elif word.dep == conj: + head = word.head + while head.dep == conj and head.head.i < head.i: + head = head.head + # If the head is an NP, and we're coordinated to it, we're an NP + if head.dep in np_deps: + prev_end = word.i + + left_index = word.left_edge.i # eliminate left attached conjunction + left_index = ( + left_index + 1 if word.left_edge.pos == conj_pos else left_index + ) + yield left_index, word.i + 1, np_label + + +SYNTAX_ITERATORS = {"noun_chunks": noun_chunks} diff --git a/spacy/lang/nb/stop_words.py b/spacy/lang/nb/stop_words.py index fd65dd788..d9ed414ef 100644 --- a/spacy/lang/nb/stop_words.py +++ b/spacy/lang/nb/stop_words.py @@ -4,46 +4,42 @@ alle allerede alt and andre annen annet at av bak bare bedre beste blant ble bli blir blitt bris by både -da dag de del dem den denne der dermed det dette disse drept du +da dag de del dem den denne der dermed det dette disse du eller en enn er et ett etter -fem fikk fire fjor flere folk for fortsatt fotball fra fram frankrike fredag +fem fikk fire fjor flere folk for fortsatt fra fram funnet få får fått før først første gang gi gikk gjennom gjorde gjort gjør gjøre god godt grunn gå går -ha hadde ham han hans har hele helt henne hennes her hun hva hvor hvordan -hvorfor +ha hadde ham han hans har hele helt henne hennes her hun i ifølge igjen ikke ingen inn ja jeg kamp kampen kan kl klart kom komme kommer kontakt kort kroner kunne kveld -kvinner -la laget land landet langt leder ligger like litt løpet lørdag +la laget land landet langt leder ligger like litt løpet -man mandag mange mannen mars med meg mellom men mener menn mennesker mens mer -millioner minutter mot msci mye må mål måtte +man mange med meg mellom men mener mennesker mens mer mot mye må mål måtte -ned neste noe noen nok norge norsk norske ntb ny nye nå når 
+ned neste noe noen nok ny nye nå når -og også om onsdag opp opplyser oslo oss over +og også om opp opplyser oss over -personer plass poeng politidistrikt politiet president prosent på +personer plass poeng på -regjeringen runde rundt russland +runde rundt -sa saken samme sammen samtidig satt se seg seks selv senere september ser sett +sa saken samme sammen samtidig satt se seg seks selv senere ser sett siden sier sin sine siste sitt skal skriver skulle slik som sted stedet stor -store står sverige svært så søndag +store står svært så -ta tatt tid tidligere til tilbake tillegg tirsdag to tok torsdag tre tror -tyskland +ta tatt tid tidligere til tilbake tillegg tok tror -under usa ut uten utenfor +under ut uten utenfor vant var ved veldig vi videre viktig vil ville viser vår være vært diff --git a/spacy/lang/sl/stop_words.py b/spacy/lang/sl/stop_words.py index 6fb01a183..c9004ed5d 100644 --- a/spacy/lang/sl/stop_words.py +++ b/spacy/lang/sl/stop_words.py @@ -1,13 +1,10 @@ # Source: https://github.com/stopwords-iso/stopwords-sl -# TODO: probably needs to be tidied up – the list seems to have month names in -# it, which shouldn't be considered stop words. +# Removed various words that are not normally considered stop words, such as months. STOP_WORDS = set( """ a ali -april -avgust b bi bil @@ -19,7 +16,6 @@ biti blizu bo bodo -bojo bolj bom bomo @@ -37,16 +33,6 @@ da daleč dan danes -datum -december -deset -deseta -deseti -deseto -devet -deveta -deveti -deveto do dober dobra @@ -54,16 +40,7 @@ dobri dobro dokler dol -dolg -dolga -dolgi dovolj -drug -druga -drugi -drugo -dva -dve e eden en @@ -74,7 +51,6 @@ enkrat eno etc. f -februar g g. ga @@ -93,16 +69,12 @@ iv ix iz j -januar jaz je ji jih jim jo -julij -junij -jutri k kadarkoli kaj @@ -123,41 +95,23 @@ kje kjer kjerkoli ko -koder koderkoli koga komu kot -kratek -kratka -kratke -kratki l -lahka -lahke -lahki -lahko le lep lepa lepe lepi lepo -leto m -maj -majhen -majhna -majhni -malce -malo manj -marec me med medtem mene -mesec mi midva midve @@ -183,7 +137,6 @@ najmanj naju največ nam -narobe nas nato nazaj @@ -192,7 +145,6 @@ naša naše ne nedavno -nedelja nek neka nekaj @@ -236,7 +188,6 @@ njuna njuno no nocoj -november npr. o ob @@ -244,51 +195,23 @@ oba obe oboje od -odprt -odprta -odprti okoli -oktober on onadva one oni onidve -osem -osma -osmi -osmo oz. p pa -pet -peta -petek -peti -peto po pod pogosto poleg -poln -polna -polni -polno ponavadi -ponedeljek ponovno potem povsod -pozdravljen -pozdravljeni -prav -prava -prave -pravi -pravo -prazen -prazna -prazno prbl. precej pred @@ -297,19 +220,10 @@ preko pri pribl. približno -primer -pripravljen -pripravljena -pripravljeni proti -prva -prvi -prvo r -ravno redko res -reč s saj sam @@ -321,29 +235,17 @@ se sebe sebi sedaj -sedem -sedma -sedmi -sedmo sem -september seveda si sicer skoraj skozi -slab smo so -sobota spet -sreda -srednja -srednji sta ste -stran -stvar sva t ta @@ -358,10 +260,6 @@ te tebe tebi tega -težak -težka -težki -težko ti tista tiste @@ -371,11 +269,6 @@ tj. 
tja to toda -torek -tretja -tretje -tretji -tri tu tudi tukaj @@ -392,10 +285,6 @@ vaša vaše ve vedno -velik -velika -veliki -veliko vendar ves več @@ -403,10 +292,6 @@ vi vidva vii viii -visok -visoka -visoke -visoki vsa vsaj vsak @@ -420,34 +305,21 @@ vsega vsi vso včasih -včeraj x z za zadaj zadnji zakaj -zaprta -zaprti -zaprto zdaj zelo zunaj č če često -četrta -četrtek -četrti -četrto čez čigav š -šest -šesta -šesti -šesto -štiri ž že """.split() diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index ffca79bb9..ee90a9f38 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -155,6 +155,11 @@ def fr_tokenizer(): return get_lang_class("fr")().tokenizer +@pytest.fixture(scope="session") +def fr_vocab(): + return get_lang_class("fr")().vocab + + @pytest.fixture(scope="session") def ga_tokenizer(): return get_lang_class("ga")().tokenizer @@ -205,6 +210,11 @@ def it_tokenizer(): return get_lang_class("it")().tokenizer +@pytest.fixture(scope="session") +def it_vocab(): + return get_lang_class("it")().vocab + + @pytest.fixture(scope="session") def ja_tokenizer(): pytest.importorskip("sudachipy") diff --git a/spacy/tests/lang/fr/test_noun_chunks.py b/spacy/tests/lang/fr/test_noun_chunks.py index 48ac88ead..25b95f566 100644 --- a/spacy/tests/lang/fr/test_noun_chunks.py +++ b/spacy/tests/lang/fr/test_noun_chunks.py @@ -1,8 +1,230 @@ +from spacy.tokens import Doc import pytest +# fmt: off +@pytest.mark.parametrize( + "words,heads,deps,pos,chunk_offsets", + [ + # determiner + noun + # un nom -> un nom + ( + ["un", "nom"], + [1, 1], + ["det", "ROOT"], + ["DET", "NOUN"], + [(0, 2)], + ), + # determiner + noun starting with vowel + # l'heure -> l'heure + ( + ["l'", "heure"], + [1, 1], + ["det", "ROOT"], + ["DET", "NOUN"], + [(0, 2)], + ), + # determiner + plural noun + # les romans -> les romans + ( + ["les", "romans"], + [1, 1], + ["det", "ROOT"], + ["DET", "NOUN"], + [(0, 2)], + ), + # det + adj + noun + # Le vieux Londres -> Le vieux Londres + ( + ['Les', 'vieux', 'Londres'], + [2, 2, 2], + ["det", "amod", "ROOT"], + ["DET", "ADJ", "NOUN"], + [(0,3)] + ), + # det + noun + adj + # le nom propre -> le nom propre a proper noun + ( + ["le", "nom", "propre"], + [1, 1, 1], + ["det", "ROOT", "amod"], + ["DET", "NOUN", "ADJ"], + [(0, 3)], + ), + # det + noun + adj plural + # Les chiens bruns -> les chiens bruns + ( + ["Les", "chiens", "bruns"], + [1, 1, 1], + ["det", "ROOT", "amod"], + ["DET", "NOUN", "ADJ"], + [(0, 3)], + ), + # multiple adjectives: one adj before the noun, one adj after the noun + # un nouveau film intéressant -> un nouveau film intéressant + ( + ["un", "nouveau", "film", "intéressant"], + [2, 2, 2, 2], + ["det", "amod", "ROOT", "amod"], + ["DET", "ADJ", "NOUN", "ADJ"], + [(0,4)] + ), + # multiple adjectives, both adjs after the noun + # une personne intelligente et drôle -> une personne intelligente et drôle + ( + ["une", "personne", "intelligente", "et", "drôle"], + [1, 1, 1, 4, 2], + ["det", "ROOT", "amod", "cc", "conj"], + ["DET", "NOUN", "ADJ", "CCONJ", "ADJ"], + [(0,5)] + ), + # relative pronoun + # un bus qui va au ville -> un bus, qui, ville + ( + ['un', 'bus', 'qui', 'va', 'au', 'ville'], + [1, 1, 3, 1, 5, 3], + ['det', 'ROOT', 'nsubj', 'acl:relcl', 'case', 'obl:arg'], + ['DET', 'NOUN', 'PRON', 'VERB', 'ADP', 'NOUN'], + [(0,2), (2,3), (5,6)] + ), + # relative subclause + # Voilà la maison que nous voulons acheter -> la maison, nous That's the house that we want to buy. 
+ ( + ['Voilà', 'la', 'maison', 'que', 'nous', 'voulons', 'acheter'], + [0, 2, 0, 5, 5, 2, 5], + ['ROOT', 'det', 'obj', 'mark', 'nsubj', 'acl:relcl', 'xcomp'], + ['VERB', 'DET', 'NOUN', 'SCONJ', 'PRON', 'VERB', 'VERB'], + [(1,3), (4,5)] + ), + # Person name and title by flat + # Louis XIV -> Louis XIV + ( + ["Louis", "XIV"], + [0, 0], + ["ROOT", "flat:name"], + ["PROPN", "PROPN"], + [(0,2)] + ), + # Organization name by flat + # Nations Unies -> Nations Unies + ( + ["Nations", "Unies"], + [0, 0], + ["ROOT", "flat:name"], + ["PROPN", "PROPN"], + [(0,2)] + ), + # Noun compound, person name created by two flats + # Louise de Bratagne -> Louise de Bratagne + ( + ["Louise", "de", "Bratagne"], + [0, 0, 0], + ["ROOT", "flat:name", "flat:name"], + ["PROPN", "PROPN", "PROPN"], + [(0,3)] + ), + # Noun compound, person name created by two flats + # Louis François Joseph -> Louis François Joseph + ( + ["Louis", "François", "Joseph"], + [0, 0, 0], + ["ROOT", "flat:name", "flat:name"], + ["PROPN", "PROPN", "PROPN"], + [(0,3)] + ), + # one determiner + one noun + one adjective qualified by an adverb + # quelques agriculteurs très riches -> quelques agriculteurs très riches + ( + ["quelques", "agriculteurs", "très", "riches"], + [1, 1, 3, 1], + ['det', 'ROOT', 'advmod', 'amod'], + ['DET', 'NOUN', 'ADV', 'ADJ'], + [(0,4)] + ), + # Two NPs conjuncted + # Il a un chien et un chat -> Il, un chien, un chat + ( + ['Il', 'a', 'un', 'chien', 'et', 'un', 'chat'], + [1, 1, 3, 1, 6, 6, 3], + ['nsubj', 'ROOT', 'det', 'obj', 'cc', 'det', 'conj'], + ['PRON', 'VERB', 'DET', 'NOUN', 'CCONJ', 'DET', 'NOUN'], + [(0,1), (2,4), (5,7)] + + ), + # Two NPs together + # l'écrivain brésilien Aníbal Machado -> l'écrivain brésilien, Aníbal Machado + ( + ["l'", 'écrivain', 'brésilien', 'Aníbal', 'Machado'], + [1, 1, 1, 1, 3], + ['det', 'ROOT', 'amod', 'appos', 'flat:name'], + ['DET', 'NOUN', 'ADJ', 'PROPN', 'PROPN'], + [(0, 3), (3, 5)] + ), + # nmod relation between NPs + # la destruction de la ville -> la destruction, la ville + ( + ['la', 'destruction', 'de', 'la', 'ville'], + [1, 1, 4, 4, 1], + ['det', 'ROOT', 'case', 'det', 'nmod'], + ['DET', 'NOUN', 'ADP', 'DET', 'NOUN'], + [(0,2), (3,5)] + ), + # nmod relation between NPs + # Archiduchesse d’Autriche -> Archiduchesse, Autriche + ( + ['Archiduchesse', 'd’', 'Autriche'], + [0, 2, 0], + ['ROOT', 'case', 'nmod'], + ['NOUN', 'ADP', 'PROPN'], + [(0,1), (2,3)] + ), + # Compounding by nmod, several NPs chained together + # la première usine de drogue du gouvernement -> la première usine, drogue, gouvernement + ( + ["la", "première", "usine", "de", "drogue", "du", "gouvernement"], + [2, 2, 2, 4, 2, 6, 2], + ['det', 'amod', 'ROOT', 'case', 'nmod', 'case', 'nmod'], + ['DET', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN'], + [(0, 3), (4, 5), (6, 7)] + ), + # several NPs + # Traduction du rapport de Susana -> Traduction, rapport, Susana + ( + ['Traduction', 'du', 'raport', 'de', 'Susana'], + [0, 2, 0, 4, 2], + ['ROOT', 'case', 'nmod', 'case', 'nmod'], + ['NOUN', 'ADP', 'NOUN', 'ADP', 'PROPN'], + [(0,1), (2,3), (4,5)] + + ), + # Several NPs + # Le gros chat de Susana et son amie -> Le gros chat, Susana, son amie + ( + ['Le', 'gros', 'chat', 'de', 'Susana', 'et', 'son', 'amie'], + [2, 2, 2, 4, 2, 7, 7, 2], + ['det', 'amod', 'ROOT', 'case', 'nmod', 'cc', 'det', 'conj'], + ['DET', 'ADJ', 'NOUN', 'ADP', 'PROPN', 'CCONJ', 'DET', 'NOUN'], + [(0,3), (4,5), (6,8)] + ), + # Passive subject + # Les nouvelles dépenses sont alimentées par le grand compte bancaire de Clinton -> Les nouvelles dépenses, le 
grand compte bancaire, Clinton + ( + ['Les', 'nouvelles', 'dépenses', 'sont', 'alimentées', 'par', 'le', 'grand', 'compte', 'bancaire', 'de', 'Clinton'], + [2, 2, 4, 4, 4, 8, 8, 8, 4, 8, 11, 8], + ['det', 'amod', 'nsubj:pass', 'aux:pass', 'ROOT', 'case', 'det', 'amod', 'obl:agent', 'amod', 'case', 'nmod'], + ['DET', 'ADJ', 'NOUN', 'AUX', 'VERB', 'ADP', 'DET', 'ADJ', 'NOUN', 'ADJ', 'ADP', 'PROPN'], + [(0, 3), (6, 10), (11, 12)] + ) + ], +) +# fmt: on +def test_fr_noun_chunks(fr_vocab, words, heads, deps, pos, chunk_offsets): + doc = Doc(fr_vocab, words=words, heads=heads, deps=deps, pos=pos) + assert [(c.start, c.end) for c in doc.noun_chunks] == chunk_offsets + + def test_noun_chunks_is_parsed_fr(fr_tokenizer): """Test that noun_chunks raises Value Error for 'fr' language if Doc is not parsed.""" - doc = fr_tokenizer("trouver des travaux antérieurs") + doc = fr_tokenizer("Je suis allé à l'école") with pytest.raises(ValueError): list(doc.noun_chunks) diff --git a/spacy/tests/lang/it/test_noun_chunks.py b/spacy/tests/lang/it/test_noun_chunks.py new file mode 100644 index 000000000..0a8c10e79 --- /dev/null +++ b/spacy/tests/lang/it/test_noun_chunks.py @@ -0,0 +1,221 @@ +from spacy.tokens import Doc +import pytest + + +# fmt: off +@pytest.mark.parametrize( + "words,heads,deps,pos,chunk_offsets", + [ + # determiner + noun + # un pollo -> un pollo + ( + ["un", "pollo"], + [1, 1], + ["det", "ROOT"], + ["DET", "NOUN"], + [(0,2)], + ), + # two determiners + noun + # il mio cane -> il mio cane + ( + ["il", "mio", "cane"], + [2, 2, 2], + ["det", "det:poss", "ROOT"], + ["DET", "DET", "NOUN"], + [(0,3)], + ), + # two determiners, one is after noun. rare usage but still testing + # il cane mio-> il cane mio + ( + ["il", "cane", "mio"], + [1, 1, 1], + ["det", "ROOT", "det:poss"], + ["DET", "NOUN", "DET"], + [(0,3)], + ), + # relative pronoun + # È molto bello il vestito che hai acquistat -> il vestito, che the dress that you bought is very pretty. 
+ ( + ["È", "molto", "bello", "il", "vestito", "che", "hai", "acquistato"], + [2, 2, 2, 4, 2, 7, 7, 4], + ['cop', 'advmod', 'ROOT', 'det', 'nsubj', 'obj', 'aux', 'acl:relcl'], + ['AUX', 'ADV', 'ADJ', 'DET', 'NOUN', 'PRON', 'AUX', 'VERB'], + [(3,5), (5,6)] + ), + # relative subclause + # il computer che hai comprato -> il computer, che the computer that you bought + ( + ['il', 'computer', 'che', 'hai', 'comprato'], + [1, 1, 4, 4, 1], + ['det', 'ROOT', 'nsubj', 'aux', 'acl:relcl'], + ['DET', 'NOUN', 'PRON', 'AUX', 'VERB'], + [(0,2), (2,3)] + ), + # det + noun + adj + # Una macchina grande -> Una macchina grande + ( + ["Una", "macchina", "grande"], + [1, 1, 1], + ["det", "ROOT", "amod"], + ["DET", "NOUN", "ADJ"], + [(0,3)], + ), + # noun + adj plural + # mucche bianche + ( + ["mucche", "bianche"], + [0, 0], + ["ROOT", "amod"], + ["NOUN", "ADJ"], + [(0,2)], + ), + # det + adj + noun + # Una grande macchina -> Una grande macchina + ( + ['Una', 'grande', 'macchina'], + [2, 2, 2], + ["det", "amod", "ROOT"], + ["DET", "ADJ", "NOUN"], + [(0,3)] + ), + # det + adj + noun, det with apostrophe + # un'importante associazione -> un'importante associazione + ( + ["Un'", 'importante', 'associazione'], + [2, 2, 2], + ["det", "amod", "ROOT"], + ["DET", "ADJ", "NOUN"], + [(0,3)] + ), + # multiple adjectives + # Un cane piccolo e marrone -> Un cane piccolo e marrone + ( + ["Un", "cane", "piccolo", "e", "marrone"], + [1, 1, 1, 4, 2], + ["det", "ROOT", "amod", "cc", "conj"], + ["DET", "NOUN", "ADJ", "CCONJ", "ADJ"], + [(0,5)] + ), + # determiner, adjective, compound created by flat + # le Nazioni Unite -> le Nazioni Unite + ( + ["le", "Nazioni", "Unite"], + [1, 1, 1], + ["det", "ROOT", "flat:name"], + ["DET", "PROPN", "PROPN"], + [(0,3)] + ), + # one determiner + one noun + one adjective qualified by an adverb + # alcuni contadini molto ricchi -> alcuni contadini molto ricchi some very rich farmers + ( + ['alcuni', 'contadini', 'molto', 'ricchi'], + [1, 1, 3, 1], + ['det', 'ROOT', 'advmod', 'amod'], + ['DET', 'NOUN', 'ADV', 'ADJ'], + [(0,4)] + ), + # Two NPs conjuncted + # Ho un cane e un gatto -> un cane, un gatto + ( + ['Ho', 'un', 'cane', 'e', 'un', 'gatto'], + [0, 2, 0, 5, 5, 0], + ['ROOT', 'det', 'obj', 'cc', 'det', 'conj'], + ['VERB', 'DET', 'NOUN', 'CCONJ', 'DET', 'NOUN'], + [(1,3), (4,6)] + + ), + # Two NPs together + # lo scrittore brasiliano Aníbal Machado -> lo scrittore brasiliano, Aníbal Machado + ( + ['lo', 'scrittore', 'brasiliano', 'Aníbal', 'Machado'], + [1, 1, 1, 1, 3], + ['det', 'ROOT', 'amod', 'nmod', 'flat:name'], + ['DET', 'NOUN', 'ADJ', 'PROPN', 'PROPN'], + [(0, 3), (3, 5)] + ), + # Noun compound, person name and titles + # Dom Pedro II -> Dom Pedro II + ( + ["Dom", "Pedro", "II"], + [0, 0, 0], + ["ROOT", "flat:name", "flat:name"], + ["PROPN", "PROPN", "PROPN"], + [(0,3)] + ), + # Noun compound created by flat + # gli Stati Uniti + ( + ["gli", "Stati", "Uniti"], + [1, 1, 1], + ["det", "ROOT", "flat:name"], + ["DET", "PROPN", "PROPN"], + [(0,3)] + ), + # nmod relation between NPs + # la distruzione della città -> la distruzione, città + ( + ['la', 'distruzione', 'della', 'città'], + [1, 1, 3, 1], + ['det', 'ROOT', 'case', 'nmod'], + ['DET', 'NOUN', 'ADP', 'NOUN'], + [(0,2), (3,4)] + ), + # Compounding by nmod, several NPs chained together + # la prima fabbrica di droga del governo -> la prima fabbrica, droga, governo + ( + ["la", "prima", "fabbrica", "di", "droga", "del", "governo"], + [2, 2, 2, 4, 2, 6, 2], + ['det', 'amod', 'ROOT', 'case', 'nmod', 'case', 'nmod'], + ['DET', 'ADJ', 
'NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN'], + [(0, 3), (4, 5), (6, 7)] + ), + # several NPs + # Traduzione del rapporto di Susana -> Traduzione, rapporto, Susana + ( + ['Traduzione', 'del', 'rapporto', 'di', 'Susana'], + [0, 2, 0, 4, 2], + ['ROOT', 'case', 'nmod', 'case', 'nmod'], + ['NOUN', 'ADP', 'NOUN', 'ADP', 'PROPN'], + [(0,1), (2,3), (4,5)] + + ), + # Several NPs + # Il gatto grasso di Susana e la sua amica -> Il gatto grasso, Susana, sua amica + ( + ['Il', 'gatto', 'grasso', 'di', 'Susana', 'e', 'la', 'sua', 'amica'], + [1, 1, 1, 4, 1, 8, 8, 8, 1], + ['det', 'ROOT', 'amod', 'case', 'nmod', 'cc', 'det', 'det:poss', 'conj'], + ['DET', 'NOUN', 'ADJ', 'ADP', 'PROPN', 'CCONJ', 'DET', 'DET', 'NOUN'], + [(0,3), (4,5), (6,9)] + ), + # Passive subject + # La nuova spesa è alimentata dal grande conto in banca di Clinton -> Le nuova spesa, grande conto, banca, Clinton + ( + ['La', 'nuova', 'spesa', 'è', 'alimentata', 'dal', 'grande', 'conto', 'in', 'banca', 'di', 'Clinton'], + [2, 2, 4, 4, 4, 7, 7, 4, 9, 7, 11, 9], + ['det', 'amod', 'nsubj:pass', 'aux:pass', 'ROOT', 'case', 'amod', 'obl:agent', 'case', 'nmod', 'case', 'nmod'], + ['DET', 'ADJ', 'NOUN', 'AUX', 'VERB', 'ADP', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'ADP', 'PROPN'], + [(0, 3), (6, 8), (9, 10), (11,12)] + ), + # Misc + # Ma mentre questo prestito possa ora sembrare gestibile, un improvviso cambiamento delle circostanze potrebbe portare a problemi di debiti -> questo prestiti, un provisso cambiento, circostanze, problemi, debiti + ( + ['Ma', 'mentre', 'questo', 'prestito', 'possa', 'ora', 'sembrare', 'gestibile', ',', 'un', 'improvviso', 'cambiamento', 'delle', 'circostanze', 'potrebbe', 'portare', 'a', 'problemi', 'di', 'debitii'], + [15, 6, 3, 6, 6, 6, 15, 6, 6, 11, 11, 15, 13, 11, 15, 15, 17, 15, 19, 17], + ['cc', 'mark', 'det', 'nsubj', 'aux', 'advmod', 'advcl', 'xcomp', 'punct', 'det', 'amod', 'nsubj', 'case', 'nmod', 'aux', 'ROOT', 'case', 'obl', 'case', 'nmod'], + ['CCONJ', 'SCONJ', 'DET', 'NOUN', 'AUX', 'ADV', 'VERB', 'ADJ', 'PUNCT', 'DET', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'AUX', 'VERB', 'ADP', 'NOUN', 'ADP', 'NOUN'], + [(2,4), (9,12), (13,14), (17,18), (19,20)] + ) + ], +) +# fmt: on +def test_it_noun_chunks(it_vocab, words, heads, deps, pos, chunk_offsets): + doc = Doc(it_vocab, words=words, heads=heads, deps=deps, pos=pos) + assert [(c.start, c.end) for c in doc.noun_chunks] == chunk_offsets + + +def test_noun_chunks_is_parsed_it(it_tokenizer): + """Test that noun_chunks raises Value Error for 'it' language if Doc is not parsed.""" + doc = it_tokenizer("Sei andato a Oxford") + with pytest.raises(ValueError): + list(doc.noun_chunks) From 6a8619dd736f03e0fa8eec173a9277a3adbc46f9 Mon Sep 17 00:00:00 2001 From: Tuomo Hiippala Date: Mon, 17 Jan 2022 09:28:51 +0200 Subject: [PATCH 04/16] Update the entry for Applied Language Technology in spaCy Universe (#10068) * add entry for Applied Language Technology under "Courses" Added the following entry into `universe.json`: ``` { "type": "education", "id": "applt-course", "title": "Applied Language Technology", "slogan": "NLP for newcomers using spaCy and Stanza", "description": "These learning materials provide an introduction to applied language technology for audiences who are unfamiliar with language technology and programming. 
The learning materials assume no previous knowledge of the Python programming language.", "url": "https://applied-language-technology.readthedocs.io/", "image": "https://www.mv.helsinki.fi/home/thiippal/images/applt-preview.jpg", "thumb": "https://applied-language-technology.readthedocs.io/en/latest/_static/logo.png", "author": "Tuomo Hiippala", "author_links": { "twitter": "tuomo_h", "github": "thiippal", "website": "https://www.mv.helsinki.fi/home/thiippal/" }, "category": ["courses"] }, ``` * Update the entry for "Applied Language Technology" --- website/meta/universe.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/website/meta/universe.json b/website/meta/universe.json index 384a7e070..0fde2d612 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -1770,9 +1770,9 @@ "title": "Applied Language Technology", "slogan": "NLP for newcomers using spaCy and Stanza", "description": "These learning materials provide an introduction to applied language technology for audiences who are unfamiliar with language technology and programming. The learning materials assume no previous knowledge of the Python programming language.", - "url": "https://applied-language-technology.readthedocs.io/", + "url": "https://applied-language-technology.mooc.fi", "image": "https://www.mv.helsinki.fi/home/thiippal/images/applt-preview.jpg", - "thumb": "https://applied-language-technology.readthedocs.io/en/latest/_static/logo.png", + "thumb": "https://www.mv.helsinki.fi/home/thiippal/images/applt-logo.png", "author": "Tuomo Hiippala", "author_links": { "twitter": "tuomo_h", From add52935ff273c9c8f37ae244803aebe02c12193 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 17 Jan 2022 10:38:37 +0100 Subject: [PATCH 05/16] Revert "Bump sudachipy version (#9917)" (#10071) This reverts commit 58bdd8607bb917f3437fdf5993dec5b6e58930c8. 
--- setup.cfg | 4 +- spacy/lang/char_classes.py | 5 - spacy/lang/fr/syntax_iterators.py | 72 ++------ spacy/lang/it/__init__.py | 4 +- spacy/lang/it/syntax_iterators.py | 86 --------- spacy/lang/nb/stop_words.py | 30 ++-- spacy/lang/sl/stop_words.py | 130 +++++++++++++- spacy/tests/conftest.py | 10 -- spacy/tests/lang/fr/test_noun_chunks.py | 224 +----------------------- spacy/tests/lang/it/test_noun_chunks.py | 221 ----------------------- 10 files changed, 162 insertions(+), 624 deletions(-) delete mode 100644 spacy/lang/it/syntax_iterators.py delete mode 100644 spacy/tests/lang/it/test_noun_chunks.py diff --git a/setup.cfg b/setup.cfg index 586a044ff..50e982cbf 100644 --- a/setup.cfg +++ b/setup.cfg @@ -108,8 +108,8 @@ apple = thinc-apple-ops>=0.0.4,<1.0.0 # Language tokenizers with external dependencies ja = - sudachipy>=0.5.2,!=0.6.1 - sudachidict_core>=20211220 + sudachipy>=0.4.9 + sudachidict_core>=20200330 ko = natto-py==0.9.0 th = diff --git a/spacy/lang/char_classes.py b/spacy/lang/char_classes.py index b15bb3cf3..9e5441a4f 100644 --- a/spacy/lang/char_classes.py +++ b/spacy/lang/char_classes.py @@ -45,10 +45,6 @@ _hangul_syllables = r"\uAC00-\uD7AF" _hangul_jamo = r"\u1100-\u11FF" _hangul = _hangul_syllables + _hangul_jamo -_hiragana = r"\u3040-\u309F" -_katakana = r"\u30A0-\u30FFー" -_kana = _hiragana + _katakana - # letters with diacritics - Catalan, Czech, Latin, Latvian, Lithuanian, Polish, Slovak, Turkish, Welsh _latin_u_extendedA = ( r"\u0100\u0102\u0104\u0106\u0108\u010A\u010C\u010E\u0110\u0112\u0114\u0116\u0118\u011A\u011C" @@ -248,7 +244,6 @@ _uncased = ( + _tamil + _telugu + _hangul - + _kana + _cjk ) diff --git a/spacy/lang/fr/syntax_iterators.py b/spacy/lang/fr/syntax_iterators.py index 5f7ba5c10..d86662693 100644 --- a/spacy/lang/fr/syntax_iterators.py +++ b/spacy/lang/fr/syntax_iterators.py @@ -6,35 +6,16 @@ from ...tokens import Doc, Span def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]: - """ - Detect base noun phrases from a dependency parse. Works on both Doc and Span. - """ - labels = [ - "nsubj", - "nsubj:pass", - "obj", - "obl", - "obl:agent", - "obl:arg", - "obl:mod", - "nmod", - "pcomp", - "appos", - "ROOT", - ] - post_modifiers = ["flat", "flat:name", "flat:foreign", "fixed", "compound"] + """Detect base noun phrases from a dependency parse. Works on Doc and Span.""" + # fmt: off + labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"] + # fmt: on doc = doclike.doc # Ensure works on both Doc and Span. 
if not doc.has_annotation("DEP"): raise ValueError(Errors.E029) - np_deps = {doc.vocab.strings.add(label) for label in labels} - np_modifs = {doc.vocab.strings.add(modifier) for modifier in post_modifiers} + np_deps = [doc.vocab.strings[label] for label in labels] + conj = doc.vocab.strings.add("conj") np_label = doc.vocab.strings.add("NP") - adj_label = doc.vocab.strings.add("amod") - det_label = doc.vocab.strings.add("det") - det_pos = doc.vocab.strings.add("DET") - adp_pos = doc.vocab.strings.add("ADP") - conj_label = doc.vocab.strings.add("conj") - conj_pos = doc.vocab.strings.add("CCONJ") prev_end = -1 for i, word in enumerate(doclike): if word.pos not in (NOUN, PROPN, PRON): @@ -43,45 +24,16 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]: if word.left_edge.i <= prev_end: continue if word.dep in np_deps: - right_childs = list(word.rights) - right_child = right_childs[0] if right_childs else None - - if right_child: - if ( - right_child.dep == adj_label - ): # allow chain of adjectives by expanding to right - right_end = right_child.right_edge - elif ( - right_child.dep == det_label and right_child.pos == det_pos - ): # cut relative pronouns here - right_end = right_child - elif right_child.dep in np_modifs: # Check if we can expand to right - right_end = word.right_edge - else: - right_end = word - else: - right_end = word - prev_end = right_end.i - - left_index = word.left_edge.i - left_index = ( - left_index + 1 if word.left_edge.pos == adp_pos else left_index - ) - - yield left_index, right_end.i + 1, np_label - elif word.dep == conj_label: + prev_end = word.right_edge.i + yield word.left_edge.i, word.right_edge.i + 1, np_label + elif word.dep == conj: head = word.head - while head.dep == conj_label and head.head.i < head.i: + while head.dep == conj and head.head.i < head.i: head = head.head # If the head is an NP, and we're coordinated to it, we're an NP if head.dep in np_deps: - prev_end = word.i - - left_index = word.left_edge.i # eliminate left attached conjunction - left_index = ( - left_index + 1 if word.left_edge.pos == conj_pos else left_index - ) - yield left_index, word.i + 1, np_label + prev_end = word.right_edge.i + yield word.left_edge.i, word.right_edge.i + 1, np_label SYNTAX_ITERATORS = {"noun_chunks": noun_chunks} diff --git a/spacy/lang/it/__init__.py b/spacy/lang/it/__init__.py index ecf322bd7..1edebc837 100644 --- a/spacy/lang/it/__init__.py +++ b/spacy/lang/it/__init__.py @@ -6,15 +6,13 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES from ...language import Language, BaseDefaults from .lemmatizer import ItalianLemmatizer -from .syntax_iterators import SYNTAX_ITERATORS class ItalianDefaults(BaseDefaults): tokenizer_exceptions = TOKENIZER_EXCEPTIONS + stop_words = STOP_WORDS prefixes = TOKENIZER_PREFIXES infixes = TOKENIZER_INFIXES - stop_words = STOP_WORDS - syntax_iterators = SYNTAX_ITERATORS class Italian(Language): diff --git a/spacy/lang/it/syntax_iterators.py b/spacy/lang/it/syntax_iterators.py deleted file mode 100644 index f63df3fad..000000000 --- a/spacy/lang/it/syntax_iterators.py +++ /dev/null @@ -1,86 +0,0 @@ -from typing import Union, Iterator, Tuple - -from ...symbols import NOUN, PROPN, PRON -from ...errors import Errors -from ...tokens import Doc, Span - - -def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]: - """ - Detect base noun phrases from a dependency parse. Works on both Doc and Span. 
- """ - labels = [ - "nsubj", - "nsubj:pass", - "obj", - "obl", - "obl:agent", - "nmod", - "pcomp", - "appos", - "ROOT", - ] - post_modifiers = ["flat", "flat:name", "fixed", "compound"] - dets = ["det", "det:poss"] - doc = doclike.doc # Ensure works on both Doc and Span. - if not doc.has_annotation("DEP"): - raise ValueError(Errors.E029) - np_deps = {doc.vocab.strings.add(label) for label in labels} - np_modifs = {doc.vocab.strings.add(modifier) for modifier in post_modifiers} - np_label = doc.vocab.strings.add("NP") - adj_label = doc.vocab.strings.add("amod") - det_labels = {doc.vocab.strings.add(det) for det in dets} - det_pos = doc.vocab.strings.add("DET") - adp_label = doc.vocab.strings.add("ADP") - conj = doc.vocab.strings.add("conj") - conj_pos = doc.vocab.strings.add("CCONJ") - prev_end = -1 - for i, word in enumerate(doclike): - if word.pos not in (NOUN, PROPN, PRON): - continue - # Prevent nested chunks from being produced - if word.left_edge.i <= prev_end: - continue - if word.dep in np_deps: - right_childs = list(word.rights) - right_child = right_childs[0] if right_childs else None - - if right_child: - if ( - right_child.dep == adj_label - ): # allow chain of adjectives by expanding to right - right_end = right_child.right_edge - elif ( - right_child.dep in det_labels and right_child.pos == det_pos - ): # cut relative pronouns here - right_end = right_child - elif right_child.dep in np_modifs: # Check if we can expand to right - right_end = word.right_edge - else: - right_end = word - else: - right_end = word - prev_end = right_end.i - - left_index = word.left_edge.i - left_index = ( - left_index + 1 if word.left_edge.pos == adp_label else left_index - ) - - yield left_index, right_end.i + 1, np_label - elif word.dep == conj: - head = word.head - while head.dep == conj and head.head.i < head.i: - head = head.head - # If the head is an NP, and we're coordinated to it, we're an NP - if head.dep in np_deps: - prev_end = word.i - - left_index = word.left_edge.i # eliminate left attached conjunction - left_index = ( - left_index + 1 if word.left_edge.pos == conj_pos else left_index - ) - yield left_index, word.i + 1, np_label - - -SYNTAX_ITERATORS = {"noun_chunks": noun_chunks} diff --git a/spacy/lang/nb/stop_words.py b/spacy/lang/nb/stop_words.py index d9ed414ef..fd65dd788 100644 --- a/spacy/lang/nb/stop_words.py +++ b/spacy/lang/nb/stop_words.py @@ -4,42 +4,46 @@ alle allerede alt and andre annen annet at av bak bare bedre beste blant ble bli blir blitt bris by både -da dag de del dem den denne der dermed det dette disse du +da dag de del dem den denne der dermed det dette disse drept du eller en enn er et ett etter -fem fikk fire fjor flere folk for fortsatt fra fram +fem fikk fire fjor flere folk for fortsatt fotball fra fram frankrike fredag funnet få får fått før først første gang gi gikk gjennom gjorde gjort gjør gjøre god godt grunn gå går -ha hadde ham han hans har hele helt henne hennes her hun +ha hadde ham han hans har hele helt henne hennes her hun hva hvor hvordan +hvorfor i ifølge igjen ikke ingen inn ja jeg kamp kampen kan kl klart kom komme kommer kontakt kort kroner kunne kveld +kvinner -la laget land landet langt leder ligger like litt løpet +la laget land landet langt leder ligger like litt løpet lørdag -man mange med meg mellom men mener mennesker mens mer mot mye må mål måtte +man mandag mange mannen mars med meg mellom men mener menn mennesker mens mer +millioner minutter mot msci mye må mål måtte -ned neste noe noen nok ny nye nå når +ned neste noe noen nok 
norge norsk norske ntb ny nye nå når -og også om opp opplyser oss over +og også om onsdag opp opplyser oslo oss over -personer plass poeng på +personer plass poeng politidistrikt politiet president prosent på -runde rundt +regjeringen runde rundt russland -sa saken samme sammen samtidig satt se seg seks selv senere ser sett +sa saken samme sammen samtidig satt se seg seks selv senere september ser sett siden sier sin sine siste sitt skal skriver skulle slik som sted stedet stor -store står svært så +store står sverige svært så søndag -ta tatt tid tidligere til tilbake tillegg tok tror +ta tatt tid tidligere til tilbake tillegg tirsdag to tok torsdag tre tror +tyskland -under ut uten utenfor +under usa ut uten utenfor vant var ved veldig vi videre viktig vil ville viser vår være vært diff --git a/spacy/lang/sl/stop_words.py b/spacy/lang/sl/stop_words.py index c9004ed5d..6fb01a183 100644 --- a/spacy/lang/sl/stop_words.py +++ b/spacy/lang/sl/stop_words.py @@ -1,10 +1,13 @@ # Source: https://github.com/stopwords-iso/stopwords-sl -# Removed various words that are not normally considered stop words, such as months. +# TODO: probably needs to be tidied up – the list seems to have month names in +# it, which shouldn't be considered stop words. STOP_WORDS = set( """ a ali +april +avgust b bi bil @@ -16,6 +19,7 @@ biti blizu bo bodo +bojo bolj bom bomo @@ -33,6 +37,16 @@ da daleč dan danes +datum +december +deset +deseta +deseti +deseto +devet +deveta +deveti +deveto do dober dobra @@ -40,7 +54,16 @@ dobri dobro dokler dol +dolg +dolga +dolgi dovolj +drug +druga +drugi +drugo +dva +dve e eden en @@ -51,6 +74,7 @@ enkrat eno etc. f +februar g g. ga @@ -69,12 +93,16 @@ iv ix iz j +januar jaz je ji jih jim jo +julij +junij +jutri k kadarkoli kaj @@ -95,23 +123,41 @@ kje kjer kjerkoli ko +koder koderkoli koga komu kot +kratek +kratka +kratke +kratki l +lahka +lahke +lahki +lahko le lep lepa lepe lepi lepo +leto m +maj +majhen +majhna +majhni +malce +malo manj +marec me med medtem mene +mesec mi midva midve @@ -137,6 +183,7 @@ najmanj naju največ nam +narobe nas nato nazaj @@ -145,6 +192,7 @@ naša naše ne nedavno +nedelja nek neka nekaj @@ -188,6 +236,7 @@ njuna njuno no nocoj +november npr. o ob @@ -195,23 +244,51 @@ oba obe oboje od +odprt +odprta +odprti okoli +oktober on onadva one oni onidve +osem +osma +osmi +osmo oz. p pa +pet +peta +petek +peti +peto po pod pogosto poleg +poln +polna +polni +polno ponavadi +ponedeljek ponovno potem povsod +pozdravljen +pozdravljeni +prav +prava +prave +pravi +pravo +prazen +prazna +prazno prbl. precej pred @@ -220,10 +297,19 @@ preko pri pribl. približno +primer +pripravljen +pripravljena +pripravljeni proti +prva +prvi +prvo r +ravno redko res +reč s saj sam @@ -235,17 +321,29 @@ se sebe sebi sedaj +sedem +sedma +sedmi +sedmo sem +september seveda si sicer skoraj skozi +slab smo so +sobota spet +sreda +srednja +srednji sta ste +stran +stvar sva t ta @@ -260,6 +358,10 @@ te tebe tebi tega +težak +težka +težki +težko ti tista tiste @@ -269,6 +371,11 @@ tj. 
tja to toda +torek +tretja +tretje +tretji +tri tu tudi tukaj @@ -285,6 +392,10 @@ vaša vaše ve vedno +velik +velika +veliki +veliko vendar ves več @@ -292,6 +403,10 @@ vi vidva vii viii +visok +visoka +visoke +visoki vsa vsaj vsak @@ -305,21 +420,34 @@ vsega vsi vso včasih +včeraj x z za zadaj zadnji zakaj +zaprta +zaprti +zaprto zdaj zelo zunaj č če često +četrta +četrtek +četrti +četrto čez čigav š +šest +šesta +šesti +šesto +štiri ž že """.split() diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index ee90a9f38..ffca79bb9 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -155,11 +155,6 @@ def fr_tokenizer(): return get_lang_class("fr")().tokenizer -@pytest.fixture(scope="session") -def fr_vocab(): - return get_lang_class("fr")().vocab - - @pytest.fixture(scope="session") def ga_tokenizer(): return get_lang_class("ga")().tokenizer @@ -210,11 +205,6 @@ def it_tokenizer(): return get_lang_class("it")().tokenizer -@pytest.fixture(scope="session") -def it_vocab(): - return get_lang_class("it")().vocab - - @pytest.fixture(scope="session") def ja_tokenizer(): pytest.importorskip("sudachipy") diff --git a/spacy/tests/lang/fr/test_noun_chunks.py b/spacy/tests/lang/fr/test_noun_chunks.py index 25b95f566..48ac88ead 100644 --- a/spacy/tests/lang/fr/test_noun_chunks.py +++ b/spacy/tests/lang/fr/test_noun_chunks.py @@ -1,230 +1,8 @@ -from spacy.tokens import Doc import pytest -# fmt: off -@pytest.mark.parametrize( - "words,heads,deps,pos,chunk_offsets", - [ - # determiner + noun - # un nom -> un nom - ( - ["un", "nom"], - [1, 1], - ["det", "ROOT"], - ["DET", "NOUN"], - [(0, 2)], - ), - # determiner + noun starting with vowel - # l'heure -> l'heure - ( - ["l'", "heure"], - [1, 1], - ["det", "ROOT"], - ["DET", "NOUN"], - [(0, 2)], - ), - # determiner + plural noun - # les romans -> les romans - ( - ["les", "romans"], - [1, 1], - ["det", "ROOT"], - ["DET", "NOUN"], - [(0, 2)], - ), - # det + adj + noun - # Le vieux Londres -> Le vieux Londres - ( - ['Les', 'vieux', 'Londres'], - [2, 2, 2], - ["det", "amod", "ROOT"], - ["DET", "ADJ", "NOUN"], - [(0,3)] - ), - # det + noun + adj - # le nom propre -> le nom propre a proper noun - ( - ["le", "nom", "propre"], - [1, 1, 1], - ["det", "ROOT", "amod"], - ["DET", "NOUN", "ADJ"], - [(0, 3)], - ), - # det + noun + adj plural - # Les chiens bruns -> les chiens bruns - ( - ["Les", "chiens", "bruns"], - [1, 1, 1], - ["det", "ROOT", "amod"], - ["DET", "NOUN", "ADJ"], - [(0, 3)], - ), - # multiple adjectives: one adj before the noun, one adj after the noun - # un nouveau film intéressant -> un nouveau film intéressant - ( - ["un", "nouveau", "film", "intéressant"], - [2, 2, 2, 2], - ["det", "amod", "ROOT", "amod"], - ["DET", "ADJ", "NOUN", "ADJ"], - [(0,4)] - ), - # multiple adjectives, both adjs after the noun - # une personne intelligente et drôle -> une personne intelligente et drôle - ( - ["une", "personne", "intelligente", "et", "drôle"], - [1, 1, 1, 4, 2], - ["det", "ROOT", "amod", "cc", "conj"], - ["DET", "NOUN", "ADJ", "CCONJ", "ADJ"], - [(0,5)] - ), - # relative pronoun - # un bus qui va au ville -> un bus, qui, ville - ( - ['un', 'bus', 'qui', 'va', 'au', 'ville'], - [1, 1, 3, 1, 5, 3], - ['det', 'ROOT', 'nsubj', 'acl:relcl', 'case', 'obl:arg'], - ['DET', 'NOUN', 'PRON', 'VERB', 'ADP', 'NOUN'], - [(0,2), (2,3), (5,6)] - ), - # relative subclause - # Voilà la maison que nous voulons acheter -> la maison, nous That's the house that we want to buy. 
- ( - ['Voilà', 'la', 'maison', 'que', 'nous', 'voulons', 'acheter'], - [0, 2, 0, 5, 5, 2, 5], - ['ROOT', 'det', 'obj', 'mark', 'nsubj', 'acl:relcl', 'xcomp'], - ['VERB', 'DET', 'NOUN', 'SCONJ', 'PRON', 'VERB', 'VERB'], - [(1,3), (4,5)] - ), - # Person name and title by flat - # Louis XIV -> Louis XIV - ( - ["Louis", "XIV"], - [0, 0], - ["ROOT", "flat:name"], - ["PROPN", "PROPN"], - [(0,2)] - ), - # Organization name by flat - # Nations Unies -> Nations Unies - ( - ["Nations", "Unies"], - [0, 0], - ["ROOT", "flat:name"], - ["PROPN", "PROPN"], - [(0,2)] - ), - # Noun compound, person name created by two flats - # Louise de Bratagne -> Louise de Bratagne - ( - ["Louise", "de", "Bratagne"], - [0, 0, 0], - ["ROOT", "flat:name", "flat:name"], - ["PROPN", "PROPN", "PROPN"], - [(0,3)] - ), - # Noun compound, person name created by two flats - # Louis François Joseph -> Louis François Joseph - ( - ["Louis", "François", "Joseph"], - [0, 0, 0], - ["ROOT", "flat:name", "flat:name"], - ["PROPN", "PROPN", "PROPN"], - [(0,3)] - ), - # one determiner + one noun + one adjective qualified by an adverb - # quelques agriculteurs très riches -> quelques agriculteurs très riches - ( - ["quelques", "agriculteurs", "très", "riches"], - [1, 1, 3, 1], - ['det', 'ROOT', 'advmod', 'amod'], - ['DET', 'NOUN', 'ADV', 'ADJ'], - [(0,4)] - ), - # Two NPs conjuncted - # Il a un chien et un chat -> Il, un chien, un chat - ( - ['Il', 'a', 'un', 'chien', 'et', 'un', 'chat'], - [1, 1, 3, 1, 6, 6, 3], - ['nsubj', 'ROOT', 'det', 'obj', 'cc', 'det', 'conj'], - ['PRON', 'VERB', 'DET', 'NOUN', 'CCONJ', 'DET', 'NOUN'], - [(0,1), (2,4), (5,7)] - - ), - # Two NPs together - # l'écrivain brésilien Aníbal Machado -> l'écrivain brésilien, Aníbal Machado - ( - ["l'", 'écrivain', 'brésilien', 'Aníbal', 'Machado'], - [1, 1, 1, 1, 3], - ['det', 'ROOT', 'amod', 'appos', 'flat:name'], - ['DET', 'NOUN', 'ADJ', 'PROPN', 'PROPN'], - [(0, 3), (3, 5)] - ), - # nmod relation between NPs - # la destruction de la ville -> la destruction, la ville - ( - ['la', 'destruction', 'de', 'la', 'ville'], - [1, 1, 4, 4, 1], - ['det', 'ROOT', 'case', 'det', 'nmod'], - ['DET', 'NOUN', 'ADP', 'DET', 'NOUN'], - [(0,2), (3,5)] - ), - # nmod relation between NPs - # Archiduchesse d’Autriche -> Archiduchesse, Autriche - ( - ['Archiduchesse', 'd’', 'Autriche'], - [0, 2, 0], - ['ROOT', 'case', 'nmod'], - ['NOUN', 'ADP', 'PROPN'], - [(0,1), (2,3)] - ), - # Compounding by nmod, several NPs chained together - # la première usine de drogue du gouvernement -> la première usine, drogue, gouvernement - ( - ["la", "première", "usine", "de", "drogue", "du", "gouvernement"], - [2, 2, 2, 4, 2, 6, 2], - ['det', 'amod', 'ROOT', 'case', 'nmod', 'case', 'nmod'], - ['DET', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN'], - [(0, 3), (4, 5), (6, 7)] - ), - # several NPs - # Traduction du rapport de Susana -> Traduction, rapport, Susana - ( - ['Traduction', 'du', 'raport', 'de', 'Susana'], - [0, 2, 0, 4, 2], - ['ROOT', 'case', 'nmod', 'case', 'nmod'], - ['NOUN', 'ADP', 'NOUN', 'ADP', 'PROPN'], - [(0,1), (2,3), (4,5)] - - ), - # Several NPs - # Le gros chat de Susana et son amie -> Le gros chat, Susana, son amie - ( - ['Le', 'gros', 'chat', 'de', 'Susana', 'et', 'son', 'amie'], - [2, 2, 2, 4, 2, 7, 7, 2], - ['det', 'amod', 'ROOT', 'case', 'nmod', 'cc', 'det', 'conj'], - ['DET', 'ADJ', 'NOUN', 'ADP', 'PROPN', 'CCONJ', 'DET', 'NOUN'], - [(0,3), (4,5), (6,8)] - ), - # Passive subject - # Les nouvelles dépenses sont alimentées par le grand compte bancaire de Clinton -> Les nouvelles dépenses, le 
grand compte bancaire, Clinton - ( - ['Les', 'nouvelles', 'dépenses', 'sont', 'alimentées', 'par', 'le', 'grand', 'compte', 'bancaire', 'de', 'Clinton'], - [2, 2, 4, 4, 4, 8, 8, 8, 4, 8, 11, 8], - ['det', 'amod', 'nsubj:pass', 'aux:pass', 'ROOT', 'case', 'det', 'amod', 'obl:agent', 'amod', 'case', 'nmod'], - ['DET', 'ADJ', 'NOUN', 'AUX', 'VERB', 'ADP', 'DET', 'ADJ', 'NOUN', 'ADJ', 'ADP', 'PROPN'], - [(0, 3), (6, 10), (11, 12)] - ) - ], -) -# fmt: on -def test_fr_noun_chunks(fr_vocab, words, heads, deps, pos, chunk_offsets): - doc = Doc(fr_vocab, words=words, heads=heads, deps=deps, pos=pos) - assert [(c.start, c.end) for c in doc.noun_chunks] == chunk_offsets - - def test_noun_chunks_is_parsed_fr(fr_tokenizer): """Test that noun_chunks raises Value Error for 'fr' language if Doc is not parsed.""" - doc = fr_tokenizer("Je suis allé à l'école") + doc = fr_tokenizer("trouver des travaux antérieurs") with pytest.raises(ValueError): list(doc.noun_chunks) diff --git a/spacy/tests/lang/it/test_noun_chunks.py b/spacy/tests/lang/it/test_noun_chunks.py deleted file mode 100644 index 0a8c10e79..000000000 --- a/spacy/tests/lang/it/test_noun_chunks.py +++ /dev/null @@ -1,221 +0,0 @@ -from spacy.tokens import Doc -import pytest - - -# fmt: off -@pytest.mark.parametrize( - "words,heads,deps,pos,chunk_offsets", - [ - # determiner + noun - # un pollo -> un pollo - ( - ["un", "pollo"], - [1, 1], - ["det", "ROOT"], - ["DET", "NOUN"], - [(0,2)], - ), - # two determiners + noun - # il mio cane -> il mio cane - ( - ["il", "mio", "cane"], - [2, 2, 2], - ["det", "det:poss", "ROOT"], - ["DET", "DET", "NOUN"], - [(0,3)], - ), - # two determiners, one is after noun. rare usage but still testing - # il cane mio-> il cane mio - ( - ["il", "cane", "mio"], - [1, 1, 1], - ["det", "ROOT", "det:poss"], - ["DET", "NOUN", "DET"], - [(0,3)], - ), - # relative pronoun - # È molto bello il vestito che hai acquistat -> il vestito, che the dress that you bought is very pretty. 
- ( - ["È", "molto", "bello", "il", "vestito", "che", "hai", "acquistato"], - [2, 2, 2, 4, 2, 7, 7, 4], - ['cop', 'advmod', 'ROOT', 'det', 'nsubj', 'obj', 'aux', 'acl:relcl'], - ['AUX', 'ADV', 'ADJ', 'DET', 'NOUN', 'PRON', 'AUX', 'VERB'], - [(3,5), (5,6)] - ), - # relative subclause - # il computer che hai comprato -> il computer, che the computer that you bought - ( - ['il', 'computer', 'che', 'hai', 'comprato'], - [1, 1, 4, 4, 1], - ['det', 'ROOT', 'nsubj', 'aux', 'acl:relcl'], - ['DET', 'NOUN', 'PRON', 'AUX', 'VERB'], - [(0,2), (2,3)] - ), - # det + noun + adj - # Una macchina grande -> Una macchina grande - ( - ["Una", "macchina", "grande"], - [1, 1, 1], - ["det", "ROOT", "amod"], - ["DET", "NOUN", "ADJ"], - [(0,3)], - ), - # noun + adj plural - # mucche bianche - ( - ["mucche", "bianche"], - [0, 0], - ["ROOT", "amod"], - ["NOUN", "ADJ"], - [(0,2)], - ), - # det + adj + noun - # Una grande macchina -> Una grande macchina - ( - ['Una', 'grande', 'macchina'], - [2, 2, 2], - ["det", "amod", "ROOT"], - ["DET", "ADJ", "NOUN"], - [(0,3)] - ), - # det + adj + noun, det with apostrophe - # un'importante associazione -> un'importante associazione - ( - ["Un'", 'importante', 'associazione'], - [2, 2, 2], - ["det", "amod", "ROOT"], - ["DET", "ADJ", "NOUN"], - [(0,3)] - ), - # multiple adjectives - # Un cane piccolo e marrone -> Un cane piccolo e marrone - ( - ["Un", "cane", "piccolo", "e", "marrone"], - [1, 1, 1, 4, 2], - ["det", "ROOT", "amod", "cc", "conj"], - ["DET", "NOUN", "ADJ", "CCONJ", "ADJ"], - [(0,5)] - ), - # determiner, adjective, compound created by flat - # le Nazioni Unite -> le Nazioni Unite - ( - ["le", "Nazioni", "Unite"], - [1, 1, 1], - ["det", "ROOT", "flat:name"], - ["DET", "PROPN", "PROPN"], - [(0,3)] - ), - # one determiner + one noun + one adjective qualified by an adverb - # alcuni contadini molto ricchi -> alcuni contadini molto ricchi some very rich farmers - ( - ['alcuni', 'contadini', 'molto', 'ricchi'], - [1, 1, 3, 1], - ['det', 'ROOT', 'advmod', 'amod'], - ['DET', 'NOUN', 'ADV', 'ADJ'], - [(0,4)] - ), - # Two NPs conjuncted - # Ho un cane e un gatto -> un cane, un gatto - ( - ['Ho', 'un', 'cane', 'e', 'un', 'gatto'], - [0, 2, 0, 5, 5, 0], - ['ROOT', 'det', 'obj', 'cc', 'det', 'conj'], - ['VERB', 'DET', 'NOUN', 'CCONJ', 'DET', 'NOUN'], - [(1,3), (4,6)] - - ), - # Two NPs together - # lo scrittore brasiliano Aníbal Machado -> lo scrittore brasiliano, Aníbal Machado - ( - ['lo', 'scrittore', 'brasiliano', 'Aníbal', 'Machado'], - [1, 1, 1, 1, 3], - ['det', 'ROOT', 'amod', 'nmod', 'flat:name'], - ['DET', 'NOUN', 'ADJ', 'PROPN', 'PROPN'], - [(0, 3), (3, 5)] - ), - # Noun compound, person name and titles - # Dom Pedro II -> Dom Pedro II - ( - ["Dom", "Pedro", "II"], - [0, 0, 0], - ["ROOT", "flat:name", "flat:name"], - ["PROPN", "PROPN", "PROPN"], - [(0,3)] - ), - # Noun compound created by flat - # gli Stati Uniti - ( - ["gli", "Stati", "Uniti"], - [1, 1, 1], - ["det", "ROOT", "flat:name"], - ["DET", "PROPN", "PROPN"], - [(0,3)] - ), - # nmod relation between NPs - # la distruzione della città -> la distruzione, città - ( - ['la', 'distruzione', 'della', 'città'], - [1, 1, 3, 1], - ['det', 'ROOT', 'case', 'nmod'], - ['DET', 'NOUN', 'ADP', 'NOUN'], - [(0,2), (3,4)] - ), - # Compounding by nmod, several NPs chained together - # la prima fabbrica di droga del governo -> la prima fabbrica, droga, governo - ( - ["la", "prima", "fabbrica", "di", "droga", "del", "governo"], - [2, 2, 2, 4, 2, 6, 2], - ['det', 'amod', 'ROOT', 'case', 'nmod', 'case', 'nmod'], - ['DET', 'ADJ', 
'NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN'], - [(0, 3), (4, 5), (6, 7)] - ), - # several NPs - # Traduzione del rapporto di Susana -> Traduzione, rapporto, Susana - ( - ['Traduzione', 'del', 'rapporto', 'di', 'Susana'], - [0, 2, 0, 4, 2], - ['ROOT', 'case', 'nmod', 'case', 'nmod'], - ['NOUN', 'ADP', 'NOUN', 'ADP', 'PROPN'], - [(0,1), (2,3), (4,5)] - - ), - # Several NPs - # Il gatto grasso di Susana e la sua amica -> Il gatto grasso, Susana, sua amica - ( - ['Il', 'gatto', 'grasso', 'di', 'Susana', 'e', 'la', 'sua', 'amica'], - [1, 1, 1, 4, 1, 8, 8, 8, 1], - ['det', 'ROOT', 'amod', 'case', 'nmod', 'cc', 'det', 'det:poss', 'conj'], - ['DET', 'NOUN', 'ADJ', 'ADP', 'PROPN', 'CCONJ', 'DET', 'DET', 'NOUN'], - [(0,3), (4,5), (6,9)] - ), - # Passive subject - # La nuova spesa è alimentata dal grande conto in banca di Clinton -> Le nuova spesa, grande conto, banca, Clinton - ( - ['La', 'nuova', 'spesa', 'è', 'alimentata', 'dal', 'grande', 'conto', 'in', 'banca', 'di', 'Clinton'], - [2, 2, 4, 4, 4, 7, 7, 4, 9, 7, 11, 9], - ['det', 'amod', 'nsubj:pass', 'aux:pass', 'ROOT', 'case', 'amod', 'obl:agent', 'case', 'nmod', 'case', 'nmod'], - ['DET', 'ADJ', 'NOUN', 'AUX', 'VERB', 'ADP', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'ADP', 'PROPN'], - [(0, 3), (6, 8), (9, 10), (11,12)] - ), - # Misc - # Ma mentre questo prestito possa ora sembrare gestibile, un improvviso cambiamento delle circostanze potrebbe portare a problemi di debiti -> questo prestiti, un provisso cambiento, circostanze, problemi, debiti - ( - ['Ma', 'mentre', 'questo', 'prestito', 'possa', 'ora', 'sembrare', 'gestibile', ',', 'un', 'improvviso', 'cambiamento', 'delle', 'circostanze', 'potrebbe', 'portare', 'a', 'problemi', 'di', 'debitii'], - [15, 6, 3, 6, 6, 6, 15, 6, 6, 11, 11, 15, 13, 11, 15, 15, 17, 15, 19, 17], - ['cc', 'mark', 'det', 'nsubj', 'aux', 'advmod', 'advcl', 'xcomp', 'punct', 'det', 'amod', 'nsubj', 'case', 'nmod', 'aux', 'ROOT', 'case', 'obl', 'case', 'nmod'], - ['CCONJ', 'SCONJ', 'DET', 'NOUN', 'AUX', 'ADV', 'VERB', 'ADJ', 'PUNCT', 'DET', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'AUX', 'VERB', 'ADP', 'NOUN', 'ADP', 'NOUN'], - [(2,4), (9,12), (13,14), (17,18), (19,20)] - ) - ], -) -# fmt: on -def test_it_noun_chunks(it_vocab, words, heads, deps, pos, chunk_offsets): - doc = Doc(it_vocab, words=words, heads=heads, deps=deps, pos=pos) - assert [(c.start, c.end) for c in doc.noun_chunks] == chunk_offsets - - -def test_noun_chunks_is_parsed_it(it_tokenizer): - """Test that noun_chunks raises Value Error for 'it' language if Doc is not parsed.""" - doc = it_tokenizer("Sei andato a Oxford") - with pytest.raises(ValueError): - list(doc.noun_chunks) From 39f1b13e7729c5fa41fd28972539cc35fce9398a Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 17 Jan 2022 11:48:39 +0100 Subject: [PATCH 06/16] Update sudachipy extras (#10072) By @polm, redone from #9917 after incorrect (reverted) rebase. `sudachipy>=0.5.2` is needed for newer dictionaries. `sudachipy<0.6.0` is kept for users who might still prefer the older version, in particular to be able to compile it without rust. 
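As a minimal, illustrative sketch (not part of this patch): once the `ja` extra is installed, e.g. via `pip install "spacy[ja]"`, the pinned SudachiPy and dictionary packages are what back the default Japanese tokenizer. The snippet below assumes that extra is present.

```python
# Sketch only: requires the "ja" extra (sudachipy + sudachidict_core) to be installed.
import spacy

nlp = spacy.blank("ja")  # constructs the SudachiPy-backed Japanese tokenizer
doc = nlp("すもももももももものうち")
print([token.text for token in doc])
```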
--- setup.cfg | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.cfg b/setup.cfg index 50e982cbf..586a044ff 100644 --- a/setup.cfg +++ b/setup.cfg @@ -108,8 +108,8 @@ apple = thinc-apple-ops>=0.0.4,<1.0.0 # Language tokenizers with external dependencies ja = - sudachipy>=0.4.9 - sudachidict_core>=20200330 + sudachipy>=0.5.2,!=0.6.1 + sudachidict_core>=20211220 ko = natto-py==0.9.0 th = From c28e33637bf7c7beef8658db7bfc33182adeca87 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Tue, 18 Jan 2022 17:36:28 +0900 Subject: [PATCH 07/16] Mark flaky spancat test so it doesn't fail the build (#10075) * Mark flaky spancat test so it doesn't fail the build * Skip, don't run and ignore --- spacy/tests/pipeline/test_spancat.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/spacy/tests/pipeline/test_spancat.py b/spacy/tests/pipeline/test_spancat.py index 2f7e952d3..39d2e97da 100644 --- a/spacy/tests/pipeline/test_spancat.py +++ b/spacy/tests/pipeline/test_spancat.py @@ -79,7 +79,8 @@ def test_explicit_labels(): nlp.initialize() assert spancat.labels == ("PERSON", "LOC") - +#TODO figure out why this is flaky +@pytest.mark.skip(reason="Test is unreliable for unknown reason") def test_doc_gc(): # If the Doc object is garbage collected, the spans won't be functional afterwards nlp = Language() @@ -97,6 +98,7 @@ def test_doc_gc(): assert isinstance(spangroups, SpanGroups) for key, spangroup in spangroups.items(): assert isinstance(spangroup, SpanGroup) + # XXX This fails with length 0 sometimes assert len(spangroup) > 0 with pytest.raises(RuntimeError): span = spangroup[0] From 4dfd559e5569f73846d5280d86487104f8550b0d Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 18 Jan 2022 17:12:42 +0100 Subject: [PATCH 08/16] Fix spaces in Doc.from_docs for empty docs (#10052) Fix spaces in `Doc.from_docs(ensure_whitespace=True)` for cases where an doc ending in whitespace is followed by an empty doc. --- spacy/tests/doc/test_doc_api.py | 5 +++-- spacy/tokens/doc.pyx | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index c6195d7e2..10700b787 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -567,6 +567,7 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer): "Merging the docs is fun.", "", "They don't think alike. 
", + "", "Another doc.", ] en_texts_without_empty = [t for t in en_texts if len(t)] @@ -574,9 +575,9 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer): en_docs = [en_tokenizer(text) for text in en_texts] en_docs[0].spans["group"] = [en_docs[0][1:4]] en_docs[2].spans["group"] = [en_docs[2][1:4]] - en_docs[3].spans["group"] = [en_docs[3][0:1]] + en_docs[4].spans["group"] = [en_docs[4][0:1]] span_group_texts = sorted( - [en_docs[0][1:4].text, en_docs[2][1:4].text, en_docs[3][0:1].text] + [en_docs[0][1:4].text, en_docs[2][1:4].text, en_docs[4][0:1].text] ) de_doc = de_tokenizer(de_text) Token.set_extension("is_ambiguous", default=False) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 362a17784..2f82a0d1b 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -1183,7 +1183,7 @@ cdef class Doc: token_offset = -1 for doc in docs[:-1]: token_offset += len(doc) - if not (len(doc) > 0 and doc[-1].is_space): + if len(doc) > 0 and not doc[-1].is_space: concat_spaces[token_offset] = True concat_array = numpy.concatenate(arrays) From 50d2a2c93071f4d96606ba0d5985c54b59184cbf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Tue, 18 Jan 2022 17:14:35 +0100 Subject: [PATCH 09/16] User fewer Vector internals (#9879) * Use Vectors.shape rather than Vectors.data.shape * Use Vectors.size rather than Vectors.data.size * Add Vectors.to_ops to move data between different ops * Add documentation for Vector.to_ops --- spacy/language.py | 8 ++++---- spacy/ml/models/multi_task.py | 4 ++-- spacy/ml/staticvectors.py | 2 +- spacy/tests/vocab_vectors/test_vectors.py | 10 +++++----- spacy/tokens/doc.pyx | 4 ++-- spacy/tokens/span.pyx | 2 +- spacy/training/initialize.py | 2 +- spacy/vectors.pyx | 7 +++++-- spacy/vocab.pyx | 4 ++-- website/docs/api/vectors.md | 17 +++++++++++++++++ 10 files changed, 40 insertions(+), 20 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index 638616316..798254b80 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1285,9 +1285,9 @@ class Language: ) except IOError: raise IOError(Errors.E884.format(vectors=I["vectors"])) - if self.vocab.vectors.data.shape[1] >= 1: + if self.vocab.vectors.shape[1] >= 1: ops = get_current_ops() - self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data) + self.vocab.vectors.to_ops(ops) if hasattr(self.tokenizer, "initialize"): tok_settings = validate_init_settings( self.tokenizer.initialize, # type: ignore[union-attr] @@ -1332,8 +1332,8 @@ class Language: DOCS: https://spacy.io/api/language#resume_training """ ops = get_current_ops() - if self.vocab.vectors.data.shape[1] >= 1: - self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data) + if self.vocab.vectors.shape[1] >= 1: + self.vocab.vectors.to_ops(ops) for name, proc in self.pipeline: if hasattr(proc, "_rehearsal_model"): proc._rehearsal_model = deepcopy(proc.model) # type: ignore[attr-defined] diff --git a/spacy/ml/models/multi_task.py b/spacy/ml/models/multi_task.py index 37473b7f4..9e1face63 100644 --- a/spacy/ml/models/multi_task.py +++ b/spacy/ml/models/multi_task.py @@ -23,7 +23,7 @@ def create_pretrain_vectors( maxout_pieces: int, hidden_size: int, loss: str ) -> Callable[["Vocab", Model], Model]: def create_vectors_objective(vocab: "Vocab", tok2vec: Model) -> Model: - if vocab.vectors.data.shape[1] == 0: + if vocab.vectors.shape[1] == 0: raise ValueError(Errors.E875) model = build_cloze_multi_task_model( vocab, tok2vec, hidden_size=hidden_size, maxout_pieces=maxout_pieces @@ -116,7 +116,7 @@ def 
build_multi_task_model( def build_cloze_multi_task_model( vocab: "Vocab", tok2vec: Model, maxout_pieces: int, hidden_size: int ) -> Model: - nO = vocab.vectors.data.shape[1] + nO = vocab.vectors.shape[1] output_layer = chain( cast(Model[List["Floats2d"], Floats2d], list2array()), Maxout( diff --git a/spacy/ml/staticvectors.py b/spacy/ml/staticvectors.py index 8dd65833b..8d9b1af9b 100644 --- a/spacy/ml/staticvectors.py +++ b/spacy/ml/staticvectors.py @@ -94,7 +94,7 @@ def init( nM = model.get_dim("nM") if model.has_dim("nM") else None nO = model.get_dim("nO") if model.has_dim("nO") else None if X is not None and len(X): - nM = X[0].vocab.vectors.data.shape[1] + nM = X[0].vocab.vectors.shape[1] if Y is not None: nO = Y.data.shape[1] diff --git a/spacy/tests/vocab_vectors/test_vectors.py b/spacy/tests/vocab_vectors/test_vectors.py index 9dc40b499..0650a7487 100644 --- a/spacy/tests/vocab_vectors/test_vectors.py +++ b/spacy/tests/vocab_vectors/test_vectors.py @@ -421,7 +421,7 @@ def test_vector_is_oov(): def test_init_vectors_unset(): v = Vectors(shape=(10, 10)) assert v.is_full is False - assert v.data.shape == (10, 10) + assert v.shape == (10, 10) with pytest.raises(ValueError): v = Vectors(shape=(10, 10), mode="floret") @@ -514,7 +514,7 @@ def test_floret_vectors(floret_vectors_vec_str, floret_vectors_hashvec_str): # rows: 2 rows per ngram rows = OPS.xp.asarray( [ - h % nlp.vocab.vectors.data.shape[0] + h % nlp.vocab.vectors.shape[0] for ngram in ngrams for h in nlp.vocab.vectors._get_ngram_hashes(ngram) ], @@ -544,17 +544,17 @@ def test_floret_vectors(floret_vectors_vec_str, floret_vectors_hashvec_str): # an empty key returns 0s assert_equal( OPS.to_numpy(nlp.vocab[""].vector), - numpy.zeros((nlp.vocab.vectors.data.shape[0],)), + numpy.zeros((nlp.vocab.vectors.shape[0],)), ) # an empty batch returns 0s assert_equal( OPS.to_numpy(nlp.vocab.vectors.get_batch([""])), - numpy.zeros((1, nlp.vocab.vectors.data.shape[0])), + numpy.zeros((1, nlp.vocab.vectors.shape[0])), ) # an empty key within a batch returns 0s assert_equal( OPS.to_numpy(nlp.vocab.vectors.get_batch(["a", "", "b"])[1]), - numpy.zeros((nlp.vocab.vectors.data.shape[0],)), + numpy.zeros((nlp.vocab.vectors.shape[0],)), ) # the loaded ngram vector table cannot be modified diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 2f82a0d1b..5a0db115d 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -616,7 +616,7 @@ cdef class Doc: """ if "has_vector" in self.user_hooks: return self.user_hooks["has_vector"](self) - elif self.vocab.vectors.data.size: + elif self.vocab.vectors.size: return True elif self.tensor.size: return True @@ -641,7 +641,7 @@ cdef class Doc: if not len(self): self._vector = xp.zeros((self.vocab.vectors_length,), dtype="f") return self._vector - elif self.vocab.vectors.data.size > 0: + elif self.vocab.vectors.size > 0: self._vector = sum(t.vector for t in self) / len(self) return self._vector elif self.tensor.size > 0: diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index cd02cab36..9bb6bf2e7 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -497,7 +497,7 @@ cdef class Span: """ if "has_vector" in self.doc.user_span_hooks: return self.doc.user_span_hooks["has_vector"](self) - elif self.vocab.vectors.data.size > 0: + elif self.vocab.vectors.size > 0: return any(token.has_vector for token in self) elif self.doc.tensor.size > 0: return True diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py index 084204389..b59288e38 100644 --- 
a/spacy/training/initialize.py +++ b/spacy/training/initialize.py @@ -164,7 +164,7 @@ def load_vectors_into_model( len(vectors_nlp.vocab.vectors.keys()) == 0 and vectors_nlp.vocab.vectors.mode != VectorsMode.floret ) or ( - vectors_nlp.vocab.vectors.data.shape[0] == 0 + vectors_nlp.vocab.vectors.shape[0] == 0 and vectors_nlp.vocab.vectors.mode == VectorsMode.floret ): logger.warning(Warnings.W112.format(name=name)) diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx index 345e8df68..bc4863703 100644 --- a/spacy/vectors.pyx +++ b/spacy/vectors.pyx @@ -10,7 +10,7 @@ from typing import cast import warnings from enum import Enum import srsly -from thinc.api import get_array_module, get_current_ops +from thinc.api import Ops, get_array_module, get_current_ops from thinc.backends import get_array_ops from thinc.types import Floats2d @@ -146,7 +146,7 @@ cdef class Vectors: DOCS: https://spacy.io/api/vectors#size """ - return self.data.shape[0] * self.data.shape[1] + return self.data.size @property def is_full(self): @@ -517,6 +517,9 @@ cdef class Vectors: for i in range(len(queries)) ], dtype="uint64") return (keys, best_rows, scores) + def to_ops(self, ops: Ops): + self.data = ops.asarray(self.data) + def _get_cfg(self): if self.mode == Mode.default: return { diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index e2e7ad1db..badd291ed 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -283,7 +283,7 @@ cdef class Vocab: @property def vectors_length(self): - return self.vectors.data.shape[1] + return self.vectors.shape[1] def reset_vectors(self, *, width=None, shape=None): """Drop the current vector table. Because all vectors must be the same @@ -294,7 +294,7 @@ cdef class Vocab: elif shape is not None: self.vectors = Vectors(strings=self.strings, shape=shape) else: - width = width if width is not None else self.vectors.data.shape[1] + width = width if width is not None else self.vectors.shape[1] self.vectors = Vectors(strings=self.strings, shape=(self.vectors.shape[0], width)) def prune_vectors(self, nr_row, batch_size=1024): diff --git a/website/docs/api/vectors.md b/website/docs/api/vectors.md index 84d2c00ad..b3bee822c 100644 --- a/website/docs/api/vectors.md +++ b/website/docs/api/vectors.md @@ -371,6 +371,23 @@ Get the vectors for the provided keys efficiently as a batch. | ------ | --------------------------------------- | | `keys` | The keys. ~~Iterable[Union[int, str]]~~ | +## Vectors.to_ops {#to_ops tag="method"} + +Change the embedding matrix to use different Thinc ops. + +> #### Example +> +> ```python +> from thinc.api import NumpyOps +> +> vectors.to_ops(NumpyOps()) +> +> ``` + +| Name | Description | +|-------|----------------------------------------------------------| +| `ops` | The Thinc ops to switch the embedding matrix to. ~~Ops~~ | + ## Vectors.to_disk {#to_disk tag="method"} Save the current state to a directory. 
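A hedged usage sketch of the API touched by the patch above (not part of the patch itself): `Vectors.shape` replaces direct reads of `Vectors.data.shape`, and the new `Vectors.to_ops` moves the embedding table between Thinc backends instead of assigning to `.data`. The example assumes a vectors-bearing pipeline such as `en_core_web_md` is installed.

```python
# Illustrative sketch: assumes a pipeline with a vector table is available.
import spacy
from thinc.api import NumpyOps, get_current_ops

nlp = spacy.load("en_core_web_md")
print(nlp.vocab.vectors.shape)  # prefer .shape over .data.shape

# Move the embedding table to the currently active backend (e.g. CupyOps on GPU),
# then back to NumPy on CPU, rather than reassigning .data directly.
nlp.vocab.vectors.to_ops(get_current_ops())
nlp.vocab.vectors.to_ops(NumpyOps())
```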
From 2ff53834bb09eea2af3b7715a2516bcf7913a370 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Wed, 19 Jan 2022 18:45:11 +0900 Subject: [PATCH 10/16] Add link to pattern file info in EntityRuler.initialize docs (#10091) * Add link to pattern file info in EntityRuler.initialize docs * Update website/docs/api/entityruler.md Co-authored-by: Sofie Van Landeghem --- website/docs/api/entityruler.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/website/docs/api/entityruler.md b/website/docs/api/entityruler.md index 6d8f835bf..1ef283870 100644 --- a/website/docs/api/entityruler.md +++ b/website/docs/api/entityruler.md @@ -99,9 +99,9 @@ be a token pattern (list) or a phrase pattern (string). For example: ## EntityRuler.initialize {#initialize tag="method" new="3"} Initialize the component with data and used before training to load in rules -from a file. This method is typically called by -[`Language.initialize`](/api/language#initialize) and lets you customize -arguments it receives via the +from a [pattern file](/usage/rule-based-matching/#entityruler-files). This method +is typically called by [`Language.initialize`](/api/language#initialize) and +lets you customize arguments it receives via the [`[initialize.components]`](/api/data-formats#config-initialize) block in the config. From 7d528e607c0c6cd267d42b2ea36e96bc25e7bd80 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 20 Jan 2022 10:53:40 +0100 Subject: [PATCH 11/16] Update quickstart install steps (#10092) * For conda: * Use conda environment rather than venv * Install `spacy-transformers` as a conda package * For pip: * Add quotes if extras are included --- website/src/widgets/quickstart-install.js | 42 +++++++++++++++++------ 1 file changed, 31 insertions(+), 11 deletions(-) diff --git a/website/src/widgets/quickstart-install.js b/website/src/widgets/quickstart-install.js index 628e1c533..1c8ad19da 100644 --- a/website/src/widgets/quickstart-install.js +++ b/website/src/widgets/quickstart-install.js @@ -113,8 +113,7 @@ const QuickstartInstall = ({ id, title }) => { { id: 'venv', title: 'virtual env', - help: - 'Use a virtual environment and install spaCy into a user directory', + help: 'Use a virtual environment', }, { id: 'train', @@ -165,27 +164,51 @@ const QuickstartInstall = ({ id, title }) => { setters={setters} showDropdown={showDropdown} > - python -m venv .env - + + python -m venv .env + + source .env/bin/activate - + source .env/bin/activate - + .env\Scripts\activate + + python -m venv .env + + + source .env/bin/activate + + + source .env/bin/activate + + + .env\Scripts\activate + + + conda create -n venv + + + conda activate venv + pip install -U pip setuptools wheel pip install -U pip setuptools wheel - pip install -U {pkg} - {pipExtras && `[${pipExtras}]`} + {pipExtras + ? `pip install -U '${pkg}[${pipExtras}]'` + : `pip install -U ${pkg}`} {nightly ? ' --pre' : ''} conda install -c conda-forge spacy conda install -c conda-forge cupy + + conda install -c conda-forge spacy-transformers + git clone https://github.com/{repo} {nightly ? 
` --branch ${DEFAULT_BRANCH}` : ''} @@ -205,9 +228,6 @@ const QuickstartInstall = ({ id, title }) => { # packages only available via pip - - pip install spacy-transformers - pip install spacy-lookups-data From e9c631453968288f224a1ab5861bf59a9c109f63 Mon Sep 17 00:00:00 2001 From: Richard Hudson Date: Thu, 20 Jan 2022 11:40:46 +0100 Subject: [PATCH 12/16] Bugfix for similarity return types (#10051) --- spacy/lexeme.pyx | 6 ++-- spacy/tests/vocab_vectors/test_similarity.py | 34 ++++++++++++++++---- spacy/tokens/span.pyx | 6 ++-- spacy/tokens/token.pyx | 6 ++-- 4 files changed, 40 insertions(+), 12 deletions(-) diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index 792e405dd..6c66effde 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -130,8 +130,10 @@ cdef class Lexeme: return 0.0 vector = self.vector xp = get_array_module(vector) - return (xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm)) - + result = xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm) + # ensure we get a scalar back (numpy does this automatically but cupy doesn't) + return result.item() + @property def has_vector(self): """RETURNS (bool): Whether a word vector is associated with the object. diff --git a/spacy/tests/vocab_vectors/test_similarity.py b/spacy/tests/vocab_vectors/test_similarity.py index 3b9308f4d..47cd1f060 100644 --- a/spacy/tests/vocab_vectors/test_similarity.py +++ b/spacy/tests/vocab_vectors/test_similarity.py @@ -35,6 +35,7 @@ def test_vectors_similarity_LL(vocab, vectors): assert lex1.vector_norm != 0 assert lex2.vector_norm != 0 assert lex1.vector[0] != lex2.vector[0] and lex1.vector[1] != lex2.vector[1] + assert isinstance(lex1.similarity(lex2), float) assert numpy.isclose(lex1.similarity(lex2), get_cosine(vec1, vec2)) assert numpy.isclose(lex2.similarity(lex2), lex1.similarity(lex1)) @@ -47,25 +48,46 @@ def test_vectors_similarity_TT(vocab, vectors): assert doc[0].vector_norm != 0 assert doc[1].vector_norm != 0 assert doc[0].vector[0] != doc[1].vector[0] and doc[0].vector[1] != doc[1].vector[1] + assert isinstance(doc[0].similarity(doc[1]), float) assert numpy.isclose(doc[0].similarity(doc[1]), get_cosine(vec1, vec2)) assert numpy.isclose(doc[1].similarity(doc[0]), doc[0].similarity(doc[1])) +def test_vectors_similarity_SS(vocab, vectors): + [(word1, vec1), (word2, vec2)] = vectors + doc = Doc(vocab, words=[word1, word2]) + assert isinstance(doc[0:1].similarity(doc[0:2]), float) + assert doc[0:1].similarity(doc[0:2]) == doc[0:2].similarity(doc[0:1]) + + +def test_vectors_similarity_DD(vocab, vectors): + [(word1, vec1), (word2, vec2)] = vectors + doc1 = Doc(vocab, words=[word1, word2]) + doc2 = Doc(vocab, words=[word2, word1]) + assert isinstance(doc1.similarity(doc2), float) + assert doc1.similarity(doc2) == doc2.similarity(doc1) + + def test_vectors_similarity_TD(vocab, vectors): [(word1, vec1), (word2, vec2)] = vectors doc = Doc(vocab, words=[word1, word2]) with pytest.warns(UserWarning): + assert isinstance(doc.similarity(doc[0]), float) + assert isinstance(doc[0].similarity(doc), float) assert doc.similarity(doc[0]) == doc[0].similarity(doc) -def test_vectors_similarity_DS(vocab, vectors): - [(word1, vec1), (word2, vec2)] = vectors - doc = Doc(vocab, words=[word1, word2]) - assert doc.similarity(doc[:2]) == doc[:2].similarity(doc) - - def test_vectors_similarity_TS(vocab, vectors): [(word1, vec1), (word2, vec2)] = vectors doc = Doc(vocab, words=[word1, word2]) with pytest.warns(UserWarning): + assert isinstance(doc[:2].similarity(doc[0]), float) + 
assert isinstance(doc[0].similarity(doc[-2]), float) assert doc[:2].similarity(doc[0]) == doc[0].similarity(doc[:2]) + + +def test_vectors_similarity_DS(vocab, vectors): + [(word1, vec1), (word2, vec2)] = vectors + doc = Doc(vocab, words=[word1, word2]) + assert isinstance(doc.similarity(doc[:2]), float) + assert doc.similarity(doc[:2]) == doc[:2].similarity(doc) diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 9bb6bf2e7..f7ddc5136 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -364,8 +364,10 @@ cdef class Span: return 0.0 vector = self.vector xp = get_array_module(vector) - return xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm) - + result = xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm) + # ensure we get a scalar back (numpy does this automatically but cupy doesn't) + return result.item() + cpdef np.ndarray to_array(self, object py_attr_ids): """Given a list of M attribute IDs, export the tokens to a numpy `ndarray` of shape `(N, M)`, where `N` is the length of the document. diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index aa97e2b07..c09ec28d6 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -209,8 +209,10 @@ cdef class Token: return 0.0 vector = self.vector xp = get_array_module(vector) - return (xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm)) - + result = xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm) + # ensure we get a scalar back (numpy does this automatically but cupy doesn't) + return result.item() + def has_morph(self): """Check whether the token has annotated morph information. Return False when the morph annotation is unset/missing. From a55212fca01f97beaf6f07e8ff3fc6e81a0b7de4 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 20 Jan 2022 11:42:52 +0100 Subject: [PATCH 13/16] Determine labels by factory name in debug data (#10079) * Determine labels by factory name in debug data For all components, return labels for all components with the corresponding factory name rather than for only the default name. For `spancat`, return labels as a dict keyed by `spans_key`. 
* Refactor for typing * Add test * Use assert instead of cast, removed unneeded arg * Mark test as slow --- spacy/cli/debug_data.py | 38 ++++++++++++++++++++++++++++++++------ spacy/tests/test_cli.py | 27 +++++++++++++++++++++++++++ 2 files changed, 59 insertions(+), 6 deletions(-) diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py index 688b07a9b..b9831fe0c 100644 --- a/spacy/cli/debug_data.py +++ b/spacy/cli/debug_data.py @@ -14,7 +14,7 @@ from ..training.initialize import get_sourced_components from ..schemas import ConfigSchemaTraining from ..pipeline._parser_internals import nonproj from ..pipeline._parser_internals.nonproj import DELIMITER -from ..pipeline import Morphologizer +from ..pipeline import Morphologizer, SpanCategorizer from ..morphology import Morphology from ..language import Language from ..util import registry, resolve_dot_names @@ -699,8 +699,34 @@ def _get_examples_without_label(data: Sequence[Example], label: str) -> int: return count -def _get_labels_from_model(nlp: Language, pipe_name: str) -> Set[str]: - if pipe_name not in nlp.pipe_names: - return set() - pipe = nlp.get_pipe(pipe_name) - return set(pipe.labels) +def _get_labels_from_model( + nlp: Language, factory_name: str +) -> Set[str]: + pipe_names = [ + pipe_name + for pipe_name in nlp.pipe_names + if nlp.get_pipe_meta(pipe_name).factory == factory_name + ] + labels: Set[str] = set() + for pipe_name in pipe_names: + pipe = nlp.get_pipe(pipe_name) + labels.update(pipe.labels) + return labels + + +def _get_labels_from_spancat( + nlp: Language +) -> Dict[str, Set[str]]: + pipe_names = [ + pipe_name + for pipe_name in nlp.pipe_names + if nlp.get_pipe_meta(pipe_name).factory == "spancat" + ] + labels: Dict[str, Set[str]] = {} + for pipe_name in pipe_names: + pipe = nlp.get_pipe(pipe_name) + assert isinstance(pipe, SpanCategorizer) + if pipe.key not in labels: + labels[pipe.key] = set() + labels[pipe.key].update(pipe.labels) + return labels diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index b0862eab6..253469909 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -12,6 +12,8 @@ from spacy.cli._util import is_subpath_of, load_project_config from spacy.cli._util import parse_config_overrides, string_to_list from spacy.cli._util import substitute_project_variables from spacy.cli._util import validate_project_commands +from spacy.cli.debug_data import _get_labels_from_model +from spacy.cli.debug_data import _get_labels_from_spancat from spacy.cli.download import get_compatibility, get_version from spacy.cli.init_config import RECOMMENDATIONS, init_config, fill_config from spacy.cli.package import get_third_party_dependencies @@ -665,3 +667,28 @@ def test_get_third_party_dependencies(): ) def test_is_subpath_of(parent, child, expected): assert is_subpath_of(parent, child) == expected + + +@pytest.mark.slow +@pytest.mark.parametrize( + "factory_name,pipe_name", + [ + ("ner", "ner"), + ("ner", "my_ner"), + ("spancat", "spancat"), + ("spancat", "my_spancat"), + ], +) +def test_get_labels_from_model(factory_name, pipe_name): + labels = ("A", "B") + + nlp = English() + pipe = nlp.add_pipe(factory_name, name=pipe_name) + for label in labels: + pipe.add_label(label) + nlp.initialize() + assert nlp.get_pipe(pipe_name).labels == labels + if factory_name == "spancat": + assert _get_labels_from_spancat(nlp)[pipe.key] == set(labels) + else: + assert _get_labels_from_model(nlp, factory_name) == set(labels) From 32bd3856b3b8fe749b77dca7d755366eaa87a2fd Mon Sep 17 00:00:00 2001 From: 
Paul O'Leary McCann Date: Thu, 20 Jan 2022 20:00:28 +0900 Subject: [PATCH 14/16] Rename FACILITY to FAC in color list (#10067) This matches the English models --- spacy/displacy/render.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/displacy/render.py b/spacy/displacy/render.py index 14d741a3d..a032d843b 100644 --- a/spacy/displacy/render.py +++ b/spacy/displacy/render.py @@ -18,7 +18,7 @@ DEFAULT_LABEL_COLORS = { "LOC": "#ff9561", "PERSON": "#aa9cfc", "NORP": "#c887fb", - "FACILITY": "#9cc9cc", + "FAC": "#9cc9cc", "EVENT": "#ffeb80", "LAW": "#ff8197", "LANGUAGE": "#ff8197", From 268ddf8a0611b86ca84ddd0a36a5ead0d177d1f1 Mon Sep 17 00:00:00 2001 From: Duygu Altinok Date: Thu, 20 Jan 2022 13:18:39 +0100 Subject: [PATCH 15/16] Add ENT_IOB key to Matcher (#9649) * added new field * added exception for IOb strings * minor refinement to schema * removed field * fixed typo * imported numeriacla val * changed the code bit * cosmetics * added test for matcher * set ents of moc docs * added invalid pattern * minor update to documentation * blacked matcher * added pattern validation * add IOB vals to schema * changed into test * mypy compat * cleaned left over * added compat import * changed type * added compat import * changed literal a bit * went back to old * made explicit type * Update spacy/schemas.py Co-authored-by: Adriane Boyd * Update spacy/schemas.py Co-authored-by: Adriane Boyd * Update spacy/schemas.py Co-authored-by: Adriane Boyd Co-authored-by: Adriane Boyd --- spacy/matcher/matcher.pyx | 7 +++-- spacy/schemas.py | 3 +++ spacy/tests/matcher/test_matcher_api.py | 27 +++++++++++++++++++ .../tests/matcher/test_pattern_validation.py | 1 + website/docs/api/matcher.md | 1 + 5 files changed, 37 insertions(+), 2 deletions(-) diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index 745d7cf43..6aa58f0e3 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -18,7 +18,7 @@ from ..tokens.doc cimport Doc, get_token_attr_for_matcher from ..tokens.span cimport Span from ..tokens.token cimport Token from ..tokens.morphanalysis cimport MorphAnalysis -from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA, MORPH +from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA, MORPH, ENT_IOB from ..schemas import validate_token_pattern from ..errors import Errors, MatchPatternError, Warnings @@ -798,7 +798,10 @@ def _get_attr_values(spec, string_store): attr = "SENT_START" attr = IDS.get(attr) if isinstance(value, str): - value = string_store.add(value) + if attr == ENT_IOB and value in Token.iob_strings(): + value = Token.iob_strings().index(value) + else: + value = string_store.add(value) elif isinstance(value, bool): value = int(value) elif isinstance(value, int): diff --git a/spacy/schemas.py b/spacy/schemas.py index cf58688ef..1dfd8ee85 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -1,5 +1,6 @@ from typing import Dict, List, Union, Optional, Any, Callable, Type, Tuple from typing import Iterable, TypeVar, TYPE_CHECKING +from .compat import Literal from enum import Enum from pydantic import BaseModel, Field, ValidationError, validator, create_model from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool @@ -209,6 +210,7 @@ NumberValue = Union[TokenPatternNumber, StrictInt, StrictFloat] UnderscoreValue = Union[ TokenPatternString, TokenPatternNumber, str, int, float, list, bool ] +IobValue = Literal["", "I", "O", "B", 0, 1, 2, 3] class TokenPattern(BaseModel): @@ -222,6 +224,7 @@ class 
TokenPattern(BaseModel): lemma: Optional[StringValue] = None shape: Optional[StringValue] = None ent_type: Optional[StringValue] = None + ent_iob: Optional[IobValue] = None ent_id: Optional[StringValue] = None ent_kb_id: Optional[StringValue] = None norm: Optional[StringValue] = None diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py index c02d65cdf..a27baf130 100644 --- a/spacy/tests/matcher/test_matcher_api.py +++ b/spacy/tests/matcher/test_matcher_api.py @@ -642,3 +642,30 @@ def test_matcher_no_zero_length(en_vocab): matcher = Matcher(en_vocab) matcher.add("TEST", [[{"TAG": "C", "OP": "?"}]]) assert len(matcher(doc)) == 0 + + +def test_matcher_ent_iob_key(en_vocab): + """Test that patterns with ent_iob works correctly.""" + matcher = Matcher(en_vocab) + matcher.add("Rule", [[{"ENT_IOB": "I"}]]) + doc1 = Doc(en_vocab, words=["I", "visited", "New", "York", "and", "California"]) + doc1.ents = [Span(doc1, 2, 4, label="GPE"), Span(doc1, 5, 6, label="GPE")] + doc2 = Doc(en_vocab, words=["I", "visited", "my", "friend", "Alicia"]) + doc2.ents = [Span(doc2, 4, 5, label="PERSON")] + matches1 = [doc1[start:end].text for _, start, end in matcher(doc1)] + matches2 = [doc2[start:end].text for _, start, end in matcher(doc2)] + assert len(matches1) == 1 + assert matches1[0] == "York" + assert len(matches2) == 0 + + matcher = Matcher(en_vocab) # Test iob pattern with operators + matcher.add("Rule", [[{"ENT_IOB": "I", "OP": "+"}]]) + doc = Doc( + en_vocab, words=["I", "visited", "my", "friend", "Anna", "Maria", "Esperanza"] + ) + doc.ents = [Span(doc, 4, 7, label="PERSON")] + matches = [doc[start:end].text for _, start, end in matcher(doc)] + assert len(matches) == 3 + assert matches[0] == "Maria" + assert matches[1] == "Maria Esperanza" + assert matches[2] == "Esperanza" diff --git a/spacy/tests/matcher/test_pattern_validation.py b/spacy/tests/matcher/test_pattern_validation.py index 74feb7c5d..8c265785c 100644 --- a/spacy/tests/matcher/test_pattern_validation.py +++ b/spacy/tests/matcher/test_pattern_validation.py @@ -12,6 +12,7 @@ TEST_PATTERNS = [ ([{"IS_PUNCT": True, "OP": "$"}], 1, 1), ([{"_": "foo"}], 1, 1), ('[{"TEXT": "foo"}, {"LOWER": "bar"}]', 1, 1), + ([{"ENT_IOB": "foo"}], 1, 1), ([1, 2, 3], 3, 1), # Bad patterns flagged outside of Matcher ([{"_": {"foo": "bar", "baz": {"IN": "foo"}}}], 2, 0), # prev: (1, 0) diff --git a/website/docs/api/matcher.md b/website/docs/api/matcher.md index 803105ba2..3e7f9dc04 100644 --- a/website/docs/api/matcher.md +++ b/website/docs/api/matcher.md @@ -44,6 +44,7 @@ rule-based matching are: | `SPACY` | Token has a trailing space. ~~bool~~ | |  `POS`, `TAG`, `MORPH`, `DEP`, `LEMMA`, `SHAPE` | The token's simple and extended part-of-speech tag, morphological analysis, dependency label, lemma, shape. ~~str~~ | | `ENT_TYPE` | The token's entity label. ~~str~~ | +| `ENT_IOB` | The IOB part of the token's entity tag. ~~str~~ | | `ENT_ID` | The token's entity ID (`ent_id`). ~~str~~ | | `ENT_KB_ID` | The token's entity knowledge base ID (`ent_kb_id`). ~~str~~ | | `_` 2.1 | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). 
~~Dict[str, Any]~~ | From 47a29168013cf077896d784344c00ac230642207 Mon Sep 17 00:00:00 2001 From: Duygu Altinok Date: Thu, 20 Jan 2022 13:19:38 +0100 Subject: [PATCH 16/16] Intify IOB (#9738) * added iob to int * added tests * added iob strings * added error * blacked attrs * Update spacy/tests/lang/test_attrs.py Co-authored-by: Adriane Boyd * Update spacy/attrs.pyx Co-authored-by: Adriane Boyd * added iob strings as global * minor refinement with iob * removed iob strings from token * changed to uppercase * cleaned and went back to master version * imported iob from attrs * Update and format errors * Support and test both str and int ENT_IOB key Co-authored-by: Adriane Boyd --- spacy/attrs.pyx | 88 +++++++++++++++++++++++++--------- spacy/errors.py | 9 ++-- spacy/tests/lang/test_attrs.py | 33 +++++++++++++ spacy/tokens/token.pyx | 3 +- 4 files changed, 107 insertions(+), 26 deletions(-) diff --git a/spacy/attrs.pyx b/spacy/attrs.pyx index 640fb2f3c..dc8eed7c3 100644 --- a/spacy/attrs.pyx +++ b/spacy/attrs.pyx @@ -1,3 +1,6 @@ +from .errors import Errors + +IOB_STRINGS = ("", "I", "O", "B") IDS = { "": NULL_ATTR, @@ -64,7 +67,6 @@ IDS = { "FLAG61": FLAG61, "FLAG62": FLAG62, "FLAG63": FLAG63, - "ID": ID, "ORTH": ORTH, "LOWER": LOWER, @@ -72,7 +74,6 @@ IDS = { "SHAPE": SHAPE, "PREFIX": PREFIX, "SUFFIX": SUFFIX, - "LENGTH": LENGTH, "LEMMA": LEMMA, "POS": POS, @@ -87,7 +88,7 @@ IDS = { "SPACY": SPACY, "LANG": LANG, "MORPH": MORPH, - "IDX": IDX + "IDX": IDX, } @@ -109,28 +110,66 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False): """ inty_attrs = {} if _do_deprecated: - if 'F' in stringy_attrs: + if "F" in stringy_attrs: stringy_attrs["ORTH"] = stringy_attrs.pop("F") - if 'L' in stringy_attrs: + if "L" in stringy_attrs: stringy_attrs["LEMMA"] = stringy_attrs.pop("L") - if 'pos' in stringy_attrs: + if "pos" in stringy_attrs: stringy_attrs["TAG"] = stringy_attrs.pop("pos") - if 'morph' in stringy_attrs: - morphs = stringy_attrs.pop('morph') - if 'number' in stringy_attrs: - stringy_attrs.pop('number') - if 'tenspect' in stringy_attrs: - stringy_attrs.pop('tenspect') + if "morph" in stringy_attrs: + morphs = stringy_attrs.pop("morph") + if "number" in stringy_attrs: + stringy_attrs.pop("number") + if "tenspect" in stringy_attrs: + stringy_attrs.pop("tenspect") morph_keys = [ - 'PunctType', 'PunctSide', 'Other', 'Degree', 'AdvType', 'Number', - 'VerbForm', 'PronType', 'Aspect', 'Tense', 'PartType', 'Poss', - 'Hyph', 'ConjType', 'NumType', 'Foreign', 'VerbType', 'NounType', - 'Gender', 'Mood', 'Negative', 'Tense', 'Voice', 'Abbr', - 'Derivation', 'Echo', 'Foreign', 'NameType', 'NounType', 'NumForm', - 'NumValue', 'PartType', 'Polite', 'StyleVariant', - 'PronType', 'AdjType', 'Person', 'Variant', 'AdpType', - 'Reflex', 'Negative', 'Mood', 'Aspect', 'Case', - 'Polarity', 'PrepCase', 'Animacy' # U20 + "PunctType", + "PunctSide", + "Other", + "Degree", + "AdvType", + "Number", + "VerbForm", + "PronType", + "Aspect", + "Tense", + "PartType", + "Poss", + "Hyph", + "ConjType", + "NumType", + "Foreign", + "VerbType", + "NounType", + "Gender", + "Mood", + "Negative", + "Tense", + "Voice", + "Abbr", + "Derivation", + "Echo", + "Foreign", + "NameType", + "NounType", + "NumForm", + "NumValue", + "PartType", + "Polite", + "StyleVariant", + "PronType", + "AdjType", + "Person", + "Variant", + "AdpType", + "Reflex", + "Negative", + "Mood", + "Aspect", + "Case", + "Polarity", + "PrepCase", + "Animacy", # U20 ] for key in morph_keys: if key in stringy_attrs: @@ -142,8 +181,13 @@ def 
intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False): for name, value in stringy_attrs.items(): int_key = intify_attr(name) if int_key is not None: + if int_key == ENT_IOB: + if value in IOB_STRINGS: + value = IOB_STRINGS.index(value) + elif isinstance(value, str): + raise ValueError(Errors.E1025.format(value=value)) if strings_map is not None and isinstance(value, str): - if hasattr(strings_map, 'add'): + if hasattr(strings_map, "add"): value = strings_map.add(value) else: value = strings_map[value] diff --git a/spacy/errors.py b/spacy/errors.py index 673674222..390612123 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -888,11 +888,14 @@ class Errors(metaclass=ErrorsWithCodes): E1021 = ("`pos` value \"{pp}\" is not a valid Universal Dependencies tag. " "Non-UD tags should use the `tag` property.") E1022 = ("Words must be of type str or int, but input is of type '{wtype}'") - E1023 = ("Couldn't read EntityRuler from the {path}. This file doesn't exist.") - E1024 = ("A pattern with ID \"{ent_id}\" is not present in EntityRuler patterns.") + E1023 = ("Couldn't read EntityRuler from the {path}. This file doesn't " + "exist.") + E1024 = ("A pattern with ID \"{ent_id}\" is not present in EntityRuler " + "patterns.") + E1025 = ("Cannot intify the value '{value}' as an IOB string. The only " + "supported values are: 'I', 'O', 'B' and ''") - # Deprecated model shortcuts, only used in errors and warnings OLD_MODEL_SHORTCUTS = { "en": "en_core_web_sm", "de": "de_core_news_sm", "es": "es_core_news_sm", diff --git a/spacy/tests/lang/test_attrs.py b/spacy/tests/lang/test_attrs.py index 5350c1fe5..1c27c1744 100644 --- a/spacy/tests/lang/test_attrs.py +++ b/spacy/tests/lang/test_attrs.py @@ -1,4 +1,5 @@ import pytest +from spacy.attrs import intify_attrs, ENT_IOB from spacy.attrs import IS_ALPHA, LEMMA, NORM, ORTH, intify_attrs from spacy.lang.en.stop_words import STOP_WORDS @@ -33,6 +34,38 @@ def test_attrs_do_deprecated(text): assert int_attrs == {ORTH: 10, IS_ALPHA: True} +def test_attrs_ent_iob_intify(): + int_attrs = intify_attrs({"ENT_IOB": ""}) + assert int_attrs == {ENT_IOB: 0} + + int_attrs = intify_attrs({"ENT_IOB": "I"}) + assert int_attrs == {ENT_IOB: 1} + + int_attrs = intify_attrs({"ENT_IOB": "O"}) + assert int_attrs == {ENT_IOB: 2} + + int_attrs = intify_attrs({"ENT_IOB": "B"}) + assert int_attrs == {ENT_IOB: 3} + + int_attrs = intify_attrs({ENT_IOB: ""}) + assert int_attrs == {ENT_IOB: 0} + + int_attrs = intify_attrs({ENT_IOB: "I"}) + assert int_attrs == {ENT_IOB: 1} + + int_attrs = intify_attrs({ENT_IOB: "O"}) + assert int_attrs == {ENT_IOB: 2} + + int_attrs = intify_attrs({ENT_IOB: "B"}) + assert int_attrs == {ENT_IOB: 3} + + with pytest.raises(ValueError): + int_attrs = intify_attrs({"ENT_IOB": "XX"}) + + with pytest.raises(ValueError): + int_attrs = intify_attrs({ENT_IOB: "XX"}) + + @pytest.mark.parametrize("text,match", [(",", True), (" ", False), ("a", False)]) def test_lex_attrs_is_punct(text, match): assert is_punct(text) == match diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index c09ec28d6..b515ab67b 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -20,6 +20,7 @@ from .doc cimport set_children_from_heads from .. import parts_of_speech from ..errors import Errors, Warnings +from ..attrs import IOB_STRINGS from .underscore import Underscore, get_ext_args @@ -745,7 +746,7 @@ cdef class Token: @classmethod def iob_strings(cls): - return ("", "I", "O", "B") + return IOB_STRINGS @property def ent_iob_(self):