Merge pull request #10309 from svlandeg/copy/develop

Update master with latest from develop
Sofie Van Landeghem 2022-02-16 15:40:58 +01:00 committed by GitHub
commit 228aaa16b7
29 changed files with 1230 additions and 282 deletions

View File

@ -45,6 +45,10 @@ _hangul_syllables = r"\uAC00-\uD7AF"
_hangul_jamo = r"\u1100-\u11FF"
_hangul = _hangul_syllables + _hangul_jamo
_hiragana = r"\u3040-\u309F"
_katakana = r"\u30A0-\u30FFー"
_kana = _hiragana + _katakana
# letters with diacritics - Catalan, Czech, Latin, Latvian, Lithuanian, Polish, Slovak, Turkish, Welsh
_latin_u_extendedA = (
r"\u0100\u0102\u0104\u0106\u0108\u010A\u010C\u010E\u0110\u0112\u0114\u0116\u0118\u011A\u011C"
@ -244,6 +248,7 @@ _uncased = (
+ _tamil
+ _telugu
+ _hangul
+ _kana
+ _cjk
)

View File

@ -47,6 +47,41 @@ _num_words = [
]
_ordinal_words = [
"primero",
"segundo",
"tercero",
"cuarto",
"quinto",
"sexto",
"séptimo",
"octavo",
"noveno",
"décimo",
"undécimo",
"duodécimo",
"decimotercero",
"decimocuarto",
"decimoquinto",
"decimosexto",
"decimoséptimo",
"decimoctavo",
"decimonoveno",
"vigésimo",
"trigésimo",
"cuadragésimo",
"quincuagésimo",
"sexagésimo",
"septuagésimo",
"octogésima",
"nonagésima",
"centésima",
"milésima",
"millonésima",
"billonésima",
]
def like_num(text):
if text.startswith(("+", "-", "±", "~")):
text = text[1:]
@ -57,7 +92,11 @@ def like_num(text):
num, denom = text.split("/")
if num.isdigit() and denom.isdigit():
return True
if text.lower() in _num_words:
text_lower = text.lower()
if text_lower in _num_words:
return True
# Check ordinal number
if text_lower in _ordinal_words:
return True
return False
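
For a quick check of the updated attribute, the function can be called directly; a minimal sketch, assuming the words below are in the lists shown above:

from spacy.lang.es.lex_attrs import like_num

# Cardinals were already recognised; ordinal words are now matched as well.
print(like_num("tres"))      # True  (cardinal, in _num_words)
print(like_num("décimo"))    # True  (ordinal, newly in _ordinal_words)
print(like_num("3/4"))       # True  (simple fraction)
print(like_num("palabra"))   # False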

View File

@ -2,6 +2,7 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
from .syntax_iterators import SYNTAX_ITERATORS
from ...language import Language, BaseDefaults
@ -11,6 +12,7 @@ class FinnishDefaults(BaseDefaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
syntax_iterators = SYNTAX_ITERATORS
class Finnish(Language):

View File

@ -0,0 +1,79 @@
from typing import Iterator, Tuple, Union
from ...tokens import Doc, Span
from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
"""Detect base noun phrases from a dependency parse. Works on both Doc and Span."""
labels = [
"appos",
"nsubj",
"nsubj:cop",
"obj",
"obl",
"ROOT",
]
extend_labels = [
"amod",
"compound",
"compound:nn",
"flat:name",
"nmod",
"nmod:gobj",
"nmod:gsubj",
"nmod:poss",
"nummod",
]
def potential_np_head(word):
return word.pos in (NOUN, PROPN) and (
word.dep in np_deps or word.head.pos == PRON
)
doc = doclike.doc # Ensure works on both Doc and Span.
if not doc.has_annotation("DEP"):
raise ValueError(Errors.E029)
np_deps = [doc.vocab.strings[label] for label in labels]
extend_deps = [doc.vocab.strings[label] for label in extend_labels]
np_label = doc.vocab.strings.add("NP")
conj_label = doc.vocab.strings.add("conj")
rbracket = 0
prev_end = -1
for i, word in enumerate(doclike):
if i < rbracket:
continue
# Is this a potential independent NP head or coordinated with
# a NOUN that is itself an independent NP head?
#
# e.g. "Terveyden ja hyvinvoinnin laitos"
if potential_np_head(word) or (
word.dep == conj_label and potential_np_head(word.head)
):
# Try to extend to the left to include adjective/num
# modifiers, compound words etc.
lbracket = word.i
for ldep in word.lefts:
if ldep.dep in extend_deps:
lbracket = ldep.left_edge.i
break
# Prevent nested chunks from being produced
if lbracket <= prev_end:
continue
rbracket = word.i
# Try to extend the span to the right to capture
# appositions and noun modifiers
for rdep in word.rights:
if rdep.dep in extend_deps:
rbracket = rdep.i
prev_end = rbracket
yield lbracket, rbracket + 1, np_label
SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
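
A minimal sketch of the new iterator in use, building a hand-annotated Doc (the sentence and annotations mirror the Finnish test data added later in this commit):

import spacy
from spacy.tokens import Doc

nlp = spacy.blank("fi")
doc = Doc(
    nlp.vocab,
    words=["Kaksi", "tyttöä", "potkii", "punaista", "palloa"],
    pos=["NUM", "NOUN", "VERB", "ADJ", "NOUN"],
    deps=["nummod", "nsubj", "ROOT", "amod", "obj"],
    heads=[1, 2, 2, 4, 2],
)
print([chunk.text for chunk in doc.noun_chunks])
# ['Kaksi tyttöä', 'punaista palloa']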

View File

@ -6,16 +6,35 @@ from ...tokens import Doc, Span
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
"""Detect base noun phrases from a dependency parse. Works on Doc and Span."""
# fmt: off
labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
# fmt: on
"""
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
"""
labels = [
"nsubj",
"nsubj:pass",
"obj",
"obl",
"obl:agent",
"obl:arg",
"obl:mod",
"nmod",
"pcomp",
"appos",
"ROOT",
]
post_modifiers = ["flat", "flat:name", "flat:foreign", "fixed", "compound"]
doc = doclike.doc # Ensure works on both Doc and Span.
if not doc.has_annotation("DEP"):
raise ValueError(Errors.E029)
np_deps = [doc.vocab.strings[label] for label in labels]
conj = doc.vocab.strings.add("conj")
np_deps = {doc.vocab.strings.add(label) for label in labels}
np_modifs = {doc.vocab.strings.add(modifier) for modifier in post_modifiers}
np_label = doc.vocab.strings.add("NP")
adj_label = doc.vocab.strings.add("amod")
det_label = doc.vocab.strings.add("det")
det_pos = doc.vocab.strings.add("DET")
adp_pos = doc.vocab.strings.add("ADP")
conj_label = doc.vocab.strings.add("conj")
conj_pos = doc.vocab.strings.add("CCONJ")
prev_end = -1
for i, word in enumerate(doclike):
if word.pos not in (NOUN, PROPN, PRON):
@ -24,16 +43,45 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
if word.left_edge.i <= prev_end:
continue
if word.dep in np_deps:
prev_end = word.right_edge.i
yield word.left_edge.i, word.right_edge.i + 1, np_label
elif word.dep == conj:
right_childs = list(word.rights)
right_child = right_childs[0] if right_childs else None
if right_child:
if (
right_child.dep == adj_label
): # allow chain of adjectives by expanding to right
right_end = right_child.right_edge
elif (
right_child.dep == det_label and right_child.pos == det_pos
): # cut relative pronouns here
right_end = right_child
elif right_child.dep in np_modifs: # Check if we can expand to right
right_end = word.right_edge
else:
right_end = word
else:
right_end = word
prev_end = right_end.i
left_index = word.left_edge.i
left_index = (
left_index + 1 if word.left_edge.pos == adp_pos else left_index
)
yield left_index, right_end.i + 1, np_label
elif word.dep == conj_label:
head = word.head
while head.dep == conj and head.head.i < head.i:
while head.dep == conj_label and head.head.i < head.i:
head = head.head
# If the head is an NP, and we're coordinated to it, we're an NP
if head.dep in np_deps:
prev_end = word.right_edge.i
yield word.left_edge.i, word.right_edge.i + 1, np_label
prev_end = word.i
left_index = word.left_edge.i # eliminate left attached conjunction
left_index = (
left_index + 1 if word.left_edge.pos == conj_pos else left_index
)
yield left_index, word.i + 1, np_label
SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
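
The practical effect of the revision is easiest to see on the relative-pronoun case from the French tests added below; a hand-annotated sketch, no trained model involved:

import spacy
from spacy.tokens import Doc

nlp = spacy.blank("fr")
# "un bus qui va au ville": the relative clause is cut off, so the chunks
# come out as "un bus", "qui" and "ville" rather than one long span.
doc = Doc(
    nlp.vocab,
    words=["un", "bus", "qui", "va", "au", "ville"],
    heads=[1, 1, 3, 1, 5, 3],
    deps=["det", "ROOT", "nsubj", "acl:relcl", "case", "obl:arg"],
    pos=["DET", "NOUN", "PRON", "VERB", "ADP", "NOUN"],
)
print([(c.start, c.end, c.text) for c in doc.noun_chunks])
# [(0, 2, 'un bus'), (2, 3, 'qui'), (5, 6, 'ville')]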

View File

@ -6,13 +6,15 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
from ...language import Language, BaseDefaults
from .lemmatizer import ItalianLemmatizer
from .syntax_iterators import SYNTAX_ITERATORS
class ItalianDefaults(BaseDefaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
stop_words = STOP_WORDS
prefixes = TOKENIZER_PREFIXES
infixes = TOKENIZER_INFIXES
stop_words = STOP_WORDS
syntax_iterators = SYNTAX_ITERATORS
class Italian(Language):

View File

@ -10,18 +10,18 @@ avresti avrete avrà avrò avuta avute avuti avuto
basta bene benissimo brava bravo
casa caso cento certa certe certi certo che chi chicchessia chiunque ci
casa caso cento certa certe certi certo che chi chicchessia chiunque ci c'
ciascuna ciascuno cima cio cioe circa citta città co codesta codesti codesto
cogli coi col colei coll coloro colui come cominci comunque con concernente
conciliarsi conclusione consiglio contro cortesia cos cosa cosi così cui
da dagl dagli dai dal dall dalla dalle dallo dappertutto davanti degl degli
dei del dell della delle dello dentro detto deve di dice dietro dire
d' da dagl dagli dai dal dall dall' dalla dalle dallo dappertutto davanti degl degli
dei del dell dell' della delle dello dentro detto deve di dice dietro dire
dirimpetto diventa diventare diventato dopo dov dove dovra dovrà dovunque due
dunque durante
ebbe ebbero ebbi ecc ecco ed effettivamente egli ella entrambi eppure era
erano eravamo eravate eri ero esempio esse essendo esser essere essi ex
e ebbe ebbero ebbi ecc ecco ed effettivamente egli ella entrambi eppure era
erano eravamo eravate eri ero esempio esse essendo esser essere essi ex è
fa faccia facciamo facciano facciate faccio facemmo facendo facesse facessero
facessi facessimo faceste facesti faceva facevamo facevano facevate facevi
@ -30,21 +30,21 @@ fareste faresti farete farà farò fatto favore fece fecero feci fin finalmente
finche fine fino forse forza fosse fossero fossi fossimo foste fosti fra
frattempo fu fui fummo fuori furono futuro generale
gia già giacche giorni giorno gli gliela gliele glieli glielo gliene governo
gia già giacche giorni giorno gli gl' gliela gliele glieli glielo gliene governo
grande grazie gruppo
ha haha hai hanno ho
ieri il improvviso in inc infatti inoltre insieme intanto intorno invece io
la lasciato lato lavoro le lei li lo lontano loro lui lungo luogo
l' la là lasciato lato lavoro le lei li lo lontano loro lui lungo luogo
ma macche magari maggior mai male malgrado malissimo mancanza marche me
m' ma macche magari maggior mai male malgrado malissimo mancanza marche me
medesimo mediante meglio meno mentre mesi mezzo mi mia mie miei mila miliardi
milioni minimi ministro mio modo molti moltissimo molto momento mondo mosto
nazionale ne negl negli nei nel nell nella nelle nello nemmeno neppure nessun
nessuna nessuno niente no noi non nondimeno nonostante nonsia nostra nostre
nazionale ne negl negli nei nel nell nella nelle nello nemmeno neppure nessun nessun'
nessuna nessuno nient' niente no noi non nondimeno nonostante nonsia nostra nostre
nostri nostro novanta nove nulla nuovo
od oggi ogni ognuna ognuno oltre oppure ora ore osi ossia ottanta otto
@ -56,12 +56,12 @@ potrebbe preferibilmente presa press prima primo principalmente probabilmente
proprio puo può pure purtroppo
qualche qualcosa qualcuna qualcuno quale quali qualunque quando quanta quante
quanti quanto quantunque quasi quattro quel quella quelle quelli quello quest
quanti quanto quantunque quasi quattro quel quel' quella quelle quelli quello quest quest'
questa queste questi questo qui quindi
realmente recente recentemente registrazione relativo riecco salvo
sara sarà sarai saranno sarebbe sarebbero sarei saremmo saremo sareste
s' sara sarà sarai saranno sarebbe sarebbero sarei saremmo saremo sareste
saresti sarete saro sarò scola scopo scorso se secondo seguente seguito sei
sembra sembrare sembrato sembri sempre senza sette si sia siamo siano siate
siete sig solito solo soltanto sono sopra sotto spesso srl sta stai stando
@ -72,12 +72,12 @@ steste stesti stette stettero stetti stia stiamo stiano stiate sto su sua
subito successivamente successivo sue sugl sugli sui sul sull sulla sulle
sullo suo suoi
tale tali talvolta tanto te tempo ti titolo tra tranne tre trenta
t' tale tali talvolta tanto te tempo ti titolo tra tranne tre trenta
troppo trovato tu tua tue tuo tuoi tutta tuttavia tutte tutti tutto
uguali ulteriore ultimo un una uno uomo
uguali ulteriore ultimo un un' una uno uomo
va vale vari varia varie vario verso vi via vicino visto vita voi volta volte
v' va vale vari varia varie vario verso vi via vicino visto vita voi volta volte
vostra vostre vostri vostro
""".split()
)

View File

@ -0,0 +1,86 @@
from typing import Union, Iterator, Tuple
from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors
from ...tokens import Doc, Span
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
"""
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
"""
labels = [
"nsubj",
"nsubj:pass",
"obj",
"obl",
"obl:agent",
"nmod",
"pcomp",
"appos",
"ROOT",
]
post_modifiers = ["flat", "flat:name", "fixed", "compound"]
dets = ["det", "det:poss"]
doc = doclike.doc # Ensure works on both Doc and Span.
if not doc.has_annotation("DEP"):
raise ValueError(Errors.E029)
np_deps = {doc.vocab.strings.add(label) for label in labels}
np_modifs = {doc.vocab.strings.add(modifier) for modifier in post_modifiers}
np_label = doc.vocab.strings.add("NP")
adj_label = doc.vocab.strings.add("amod")
det_labels = {doc.vocab.strings.add(det) for det in dets}
det_pos = doc.vocab.strings.add("DET")
adp_label = doc.vocab.strings.add("ADP")
conj = doc.vocab.strings.add("conj")
conj_pos = doc.vocab.strings.add("CCONJ")
prev_end = -1
for i, word in enumerate(doclike):
if word.pos not in (NOUN, PROPN, PRON):
continue
# Prevent nested chunks from being produced
if word.left_edge.i <= prev_end:
continue
if word.dep in np_deps:
right_childs = list(word.rights)
right_child = right_childs[0] if right_childs else None
if right_child:
if (
right_child.dep == adj_label
): # allow chain of adjectives by expanding to right
right_end = right_child.right_edge
elif (
right_child.dep in det_labels and right_child.pos == det_pos
): # cut relative pronouns here
right_end = right_child
elif right_child.dep in np_modifs: # Check if we can expand to right
right_end = word.right_edge
else:
right_end = word
else:
right_end = word
prev_end = right_end.i
left_index = word.left_edge.i
left_index = (
left_index + 1 if word.left_edge.pos == adp_label else left_index
)
yield left_index, right_end.i + 1, np_label
elif word.dep == conj:
head = word.head
while head.dep == conj and head.head.i < head.i:
head = head.head
# If the head is an NP, and we're coordinated to it, we're an NP
if head.dep in np_deps:
prev_end = word.i
left_index = word.left_edge.i # eliminate left attached conjunction
left_index = (
left_index + 1 if word.left_edge.pos == conj_pos else left_index
)
yield left_index, word.i + 1, np_label
SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
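
A short sketch of the new Italian iterator on conjoined noun phrases, mirroring the test data added below:

import spacy
from spacy.tokens import Doc

nlp = spacy.blank("it")
# "Ho un cane e un gatto": the two conjoined objects become separate chunks,
# with the conjunction itself excluded.
doc = Doc(
    nlp.vocab,
    words=["Ho", "un", "cane", "e", "un", "gatto"],
    heads=[0, 2, 0, 5, 5, 0],
    deps=["ROOT", "det", "obj", "cc", "det", "conj"],
    pos=["VERB", "DET", "NOUN", "CCONJ", "DET", "NOUN"],
)
print([(c.start, c.end, c.text) for c in doc.noun_chunks])
# [(1, 3, 'un cane'), (4, 6, 'un gatto')]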

View File

@ -4,46 +4,42 @@ alle allerede alt and andre annen annet at av
bak bare bedre beste blant ble bli blir blitt bris by både
da dag de del dem den denne der dermed det dette disse drept du
da dag de del dem den denne der dermed det dette disse du
eller en enn er et ett etter
fem fikk fire fjor flere folk for fortsatt fotball fra fram frankrike fredag
fem fikk fire fjor flere folk for fortsatt fra fram
funnet får fått før først første
gang gi gikk gjennom gjorde gjort gjør gjøre god godt grunn går
ha hadde ham han hans har hele helt henne hennes her hun hva hvor hvordan
hvorfor
ha hadde ham han hans har hele helt henne hennes her hun
i ifølge igjen ikke ingen inn
ja jeg
kamp kampen kan kl klart kom komme kommer kontakt kort kroner kunne kveld
kvinner
la laget land landet langt leder ligger like litt løpet lørdag
la laget land landet langt leder ligger like litt løpet
man mandag mange mannen mars med meg mellom men mener menn mennesker mens mer
millioner minutter mot msci mye mål måtte
man mange med meg mellom men mener mennesker mens mer mot mye mål måtte
ned neste noe noen nok norge norsk norske ntb ny nye når
ned neste noe noen nok ny nye når
og også om onsdag opp opplyser oslo oss over
og også om opp opplyser oss over
personer plass poeng politidistrikt politiet president prosent på
personer plass poeng på
regjeringen runde rundt russland
runde rundt
sa saken samme sammen samtidig satt se seg seks selv senere september ser sett
sa saken samme sammen samtidig satt se seg seks selv senere ser sett
siden sier sin sine siste sitt skal skriver skulle slik som sted stedet stor
store står sverige svært søndag
store står svært
ta tatt tid tidligere til tilbake tillegg tirsdag to tok torsdag tre tror
tyskland
ta tatt tid tidligere til tilbake tillegg tok tror
under usa ut uten utenfor
under ut uten utenfor
vant var ved veldig vi videre viktig vil ville viser vår være vært

View File

@ -1,13 +1,10 @@
# Source: https://github.com/stopwords-iso/stopwords-sl
# TODO: probably needs to be tidied up the list seems to have month names in
# it, which shouldn't be considered stop words.
# Removed various words that are not normally considered stop words, such as months.
STOP_WORDS = set(
"""
a
ali
april
avgust
b
bi
bil
@ -19,7 +16,6 @@ biti
blizu
bo
bodo
bojo
bolj
bom
bomo
@ -37,16 +33,6 @@ da
daleč
dan
danes
datum
december
deset
deseta
deseti
deseto
devet
deveta
deveti
deveto
do
dober
dobra
@ -54,16 +40,7 @@ dobri
dobro
dokler
dol
dolg
dolga
dolgi
dovolj
drug
druga
drugi
drugo
dva
dve
e
eden
en
@ -74,7 +51,6 @@ enkrat
eno
etc.
f
februar
g
g.
ga
@ -93,16 +69,12 @@ iv
ix
iz
j
januar
jaz
je
ji
jih
jim
jo
julij
junij
jutri
k
kadarkoli
kaj
@ -123,41 +95,23 @@ kje
kjer
kjerkoli
ko
koder
koderkoli
koga
komu
kot
kratek
kratka
kratke
kratki
l
lahka
lahke
lahki
lahko
le
lep
lepa
lepe
lepi
lepo
leto
m
maj
majhen
majhna
majhni
malce
malo
manj
marec
me
med
medtem
mene
mesec
mi
midva
midve
@ -183,7 +137,6 @@ najmanj
naju
največ
nam
narobe
nas
nato
nazaj
@ -192,7 +145,6 @@ naša
naše
ne
nedavno
nedelja
nek
neka
nekaj
@ -236,7 +188,6 @@ njuna
njuno
no
nocoj
november
npr.
o
ob
@ -244,51 +195,23 @@ oba
obe
oboje
od
odprt
odprta
odprti
okoli
oktober
on
onadva
one
oni
onidve
osem
osma
osmi
osmo
oz.
p
pa
pet
peta
petek
peti
peto
po
pod
pogosto
poleg
poln
polna
polni
polno
ponavadi
ponedeljek
ponovno
potem
povsod
pozdravljen
pozdravljeni
prav
prava
prave
pravi
pravo
prazen
prazna
prazno
prbl.
precej
pred
@ -297,19 +220,10 @@ preko
pri
pribl.
približno
primer
pripravljen
pripravljena
pripravljeni
proti
prva
prvi
prvo
r
ravno
redko
res
reč
s
saj
sam
@ -321,29 +235,17 @@ se
sebe
sebi
sedaj
sedem
sedma
sedmi
sedmo
sem
september
seveda
si
sicer
skoraj
skozi
slab
smo
so
sobota
spet
sreda
srednja
srednji
sta
ste
stran
stvar
sva
t
ta
@ -358,10 +260,6 @@ te
tebe
tebi
tega
težak
težka
težki
težko
ti
tista
tiste
@ -371,11 +269,6 @@ tj.
tja
to
toda
torek
tretja
tretje
tretji
tri
tu
tudi
tukaj
@ -392,10 +285,6 @@ vaša
vaše
ve
vedno
velik
velika
veliki
veliko
vendar
ves
več
@ -403,10 +292,6 @@ vi
vidva
vii
viii
visok
visoka
visoke
visoki
vsa
vsaj
vsak
@ -420,34 +305,21 @@ vsega
vsi
vso
včasih
včeraj
x
z
za
zadaj
zadnji
zakaj
zaprta
zaprti
zaprto
zdaj
zelo
zunaj
č
če
često
četrta
četrtek
četrti
četrto
čez
čigav
š
šest
šesta
šesti
šesto
štiri
ž
že
""".split()

View File

@ -6,19 +6,30 @@ from ...util import update_exc
_exc = {}
for exc_data in [
{ORTH: "обл.", NORM: "область"},
{ORTH: "р-н.", NORM: "район"},
{ORTH: "р", NORM: "район"},
{ORTH: "м.", NORM: "місто"},
{ORTH: "вул.", NORM: "вулиця"},
{ORTH: "ім.", NORM: "імені"},
{ORTH: "просп.", NORM: "проспект"},
{ORTH: "пр-кт", NORM: "проспект"},
{ORTH: "бул.", NORM: "бульвар"},
{ORTH: "пров.", NORM: "провулок"},
{ORTH: "пл.", NORM: "площа"},
{ORTH: "майд.", NORM: "майдан"},
{ORTH: "мкр.", NORM: "мікрорайон"},
{ORTH: "ст.", NORM: "станція"},
{ORTH: "ж/м", NORM: "житловий масив"},
{ORTH: "наб.", NORM: "набережна"},
{ORTH: "в/ч", NORM: "військова частина"},
{ORTH: "в/м", NORM: "військове містечко"},
{ORTH: "оз.", NORM: "озеро"},
{ORTH: "ім.", NORM: "імені"},
{ORTH: "г.", NORM: "гора"},
{ORTH: "п.", NORM: "пан"},
{ORTH: "м.", NORM: "місто"},
{ORTH: "проф.", NORM: "професор"},
{ORTH: "акад.", NORM: "академік"},
{ORTH: "доц.", NORM: "доцент"},
{ORTH: "оз.", NORM: "озеро"},
]:
_exc[exc_data[ORTH]] = [exc_data]
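
As a rough illustration, the added exceptions keep each abbreviation as a single token and attach the expanded norm; a sketch assuming a blank Ukrainian pipeline (the optional pymorphy dependency is only needed for the lemmatizer, not for tokenization):

import spacy

nlp = spacy.blank("uk")
doc = nlp("просп. Перемоги")
print(doc[0].text, doc[0].norm_)
# Expected: просп. проспект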

View File

@ -3,6 +3,7 @@ from libc.string cimport memcpy, memset
from libc.stdlib cimport calloc, free
from libc.stdint cimport uint32_t, uint64_t
cimport libcpp
from libcpp.unordered_map cimport unordered_map
from libcpp.vector cimport vector
from libcpp.set cimport set
from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno
@ -30,8 +31,8 @@ cdef cppclass StateC:
vector[int] _stack
vector[int] _rebuffer
vector[SpanC] _ents
vector[ArcC] _left_arcs
vector[ArcC] _right_arcs
unordered_map[int, vector[ArcC]] _left_arcs
unordered_map[int, vector[ArcC]] _right_arcs
vector[libcpp.bool] _unshiftable
set[int] _sent_starts
TokenC _empty_token
@ -160,15 +161,22 @@ cdef cppclass StateC:
else:
return &this._sent[i]
void get_arcs(vector[ArcC]* arcs) nogil const:
for i in range(this._left_arcs.size()):
arc = this._left_arcs.at(i)
if arc.head != -1 and arc.child != -1:
arcs.push_back(arc)
for i in range(this._right_arcs.size()):
arc = this._right_arcs.at(i)
if arc.head != -1 and arc.child != -1:
arcs.push_back(arc)
void map_get_arcs(const unordered_map[int, vector[ArcC]] &heads_arcs, vector[ArcC]* out) nogil const:
cdef const vector[ArcC]* arcs
head_arcs_it = heads_arcs.const_begin()
while head_arcs_it != heads_arcs.const_end():
arcs = &deref(head_arcs_it).second
arcs_it = arcs.const_begin()
while arcs_it != arcs.const_end():
arc = deref(arcs_it)
if arc.head != -1 and arc.child != -1:
out.push_back(arc)
incr(arcs_it)
incr(head_arcs_it)
void get_arcs(vector[ArcC]* out) nogil const:
this.map_get_arcs(this._left_arcs, out)
this.map_get_arcs(this._right_arcs, out)
int H(int child) nogil const:
if child >= this.length or child < 0:
@ -182,37 +190,35 @@ cdef cppclass StateC:
else:
return this._ents.back().start
int L(int head, int idx) nogil const:
if idx < 1 or this._left_arcs.size() == 0:
int nth_child(const unordered_map[int, vector[ArcC]]& heads_arcs, int head, int idx) nogil const:
if idx < 1:
return -1
# Work backwards through left-arcs to find the arc at the
head_arcs_it = heads_arcs.const_find(head)
if head_arcs_it == heads_arcs.const_end():
return -1
cdef const vector[ArcC]* arcs = &deref(head_arcs_it).second
# Work backwards through arcs to find the arc at the
# requested index more quickly.
cdef size_t child_index = 0
it = this._left_arcs.const_rbegin()
while it != this._left_arcs.rend():
arc = deref(it)
if arc.head == head and arc.child != -1 and arc.child < head:
arcs_it = arcs.const_rbegin()
while arcs_it != arcs.const_rend() and child_index != idx:
arc = deref(arcs_it)
if arc.child != -1:
child_index += 1
if child_index == idx:
return arc.child
incr(it)
incr(arcs_it)
return -1
int L(int head, int idx) nogil const:
return this.nth_child(this._left_arcs, head, idx)
int R(int head, int idx) nogil const:
if idx < 1 or this._right_arcs.size() == 0:
return -1
cdef vector[int] rights
for i in range(this._right_arcs.size()):
arc = this._right_arcs.at(i)
if arc.head == head and arc.child != -1 and arc.child > head:
rights.push_back(arc.child)
idx = (<int>rights.size()) - idx
if idx < 0:
return -1
else:
return rights.at(idx)
return this.nth_child(this._right_arcs, head, idx)
bint empty() nogil const:
return this._stack.size() == 0
@ -253,22 +259,29 @@ cdef cppclass StateC:
int r_edge(int word) nogil const:
return word
int n_L(int head) nogil const:
int n_arcs(const unordered_map[int, vector[ArcC]] &heads_arcs, int head) nogil const:
cdef int n = 0
for i in range(this._left_arcs.size()):
arc = this._left_arcs.at(i)
if arc.head == head and arc.child != -1 and arc.child < arc.head:
head_arcs_it = heads_arcs.const_find(head)
if head_arcs_it == heads_arcs.const_end():
return n
cdef const vector[ArcC]* arcs = &deref(head_arcs_it).second
arcs_it = arcs.const_begin()
while arcs_it != arcs.end():
arc = deref(arcs_it)
if arc.child != -1:
n += 1
incr(arcs_it)
return n
int n_L(int head) nogil const:
return n_arcs(this._left_arcs, head)
int n_R(int head) nogil const:
cdef int n = 0
for i in range(this._right_arcs.size()):
arc = this._right_arcs.at(i)
if arc.head == head and arc.child != -1 and arc.child > arc.head:
n += 1
return n
return n_arcs(this._right_arcs, head)
bint stack_is_connected() nogil const:
return False
@ -328,19 +341,20 @@ cdef cppclass StateC:
arc.child = child
arc.label = label
if head > child:
this._left_arcs.push_back(arc)
this._left_arcs[arc.head].push_back(arc)
else:
this._right_arcs.push_back(arc)
this._right_arcs[arc.head].push_back(arc)
this._heads[child] = head
void del_arc(int h_i, int c_i) nogil:
cdef vector[ArcC]* arcs
if h_i > c_i:
arcs = &this._left_arcs
else:
arcs = &this._right_arcs
void map_del_arc(unordered_map[int, vector[ArcC]]* heads_arcs, int h_i, int c_i) nogil:
arcs_it = heads_arcs.find(h_i)
if arcs_it == heads_arcs.end():
return
arcs = &deref(arcs_it).second
if arcs.size() == 0:
return
arc = arcs.back()
if arc.head == h_i and arc.child == c_i:
arcs.pop_back()
@ -353,6 +367,12 @@ cdef cppclass StateC:
arc.label = 0
break
void del_arc(int h_i, int c_i) nogil:
if h_i > c_i:
this.map_del_arc(&this._left_arcs, h_i, c_i)
else:
this.map_del_arc(&this._right_arcs, h_i, c_i)
SpanC get_ent() nogil const:
cdef SpanC ent
if this._ents.size() == 0:
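
The refactor above groups arcs by head instead of keeping one flat vector, so queries such as L(head, idx), R(head, idx), n_L and n_R only touch that head's arcs. A rough Python analogue of the idea (hypothetical helper names, not the actual Cython code):

from collections import defaultdict

left_arcs = defaultdict(list)  # head index -> list of (head, child, label)

def add_left_arc(head, child, label):
    left_arcs[head].append((head, child, label))

def n_left(head):
    # Analogue of n_L(head): only this head's bucket is inspected.
    return sum(1 for _, child, _ in left_arcs.get(head, []) if child != -1)

add_left_arc(3, 1, "det")
add_left_arc(3, 2, "amod")
add_left_arc(7, 6, "det")
print(n_left(3), n_left(7), n_left(5))  # 2 1 0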

View File

@ -1,6 +1,6 @@
# cython: infer_types=True, profile=True, binding=True
from itertools import islice
from typing import Optional, Callable
from itertools import islice
import srsly
from thinc.api import Model, SequenceCategoricalCrossentropy, Config

View File

@ -1,9 +1,10 @@
import numpy
from typing import List, Dict, Callable, Tuple, Optional, Iterable, Any, cast
from thinc.api import Config, Model, get_current_ops, set_dropout_rate, Ops
from thinc.api import Optimizer
from thinc.types import Ragged, Ints2d, Floats2d, Ints1d
import numpy
from ..compat import Protocol, runtime_checkable
from ..scorer import Scorer
from ..language import Language

View File

@ -1,8 +1,8 @@
from itertools import islice
from typing import Iterable, Tuple, Optional, Dict, List, Callable, Any
from thinc.api import get_array_module, Model, Optimizer, set_dropout_rate, Config
from thinc.types import Floats2d
import numpy
from itertools import islice
from .trainable_pipe import TrainablePipe
from ..language import Language
@ -158,6 +158,13 @@ class TextCategorizer(TrainablePipe):
self.cfg = dict(cfg)
self.scorer = scorer
@property
def support_missing_values(self):
# There are no missing values as the textcat should always
# predict exactly one label. All other labels are 0.0
# Subclasses may override this property to change internal behaviour.
return False
@property
def labels(self) -> Tuple[str]:
"""RETURNS (Tuple[str]): The labels currently added to the component.
@ -294,7 +301,7 @@ class TextCategorizer(TrainablePipe):
for j, label in enumerate(self.labels):
if label in eg.reference.cats:
truths[i, j] = eg.reference.cats[label]
else:
elif self.support_missing_values:
not_missing[i, j] = 0.0
truths = self.model.ops.asarray(truths) # type: ignore
return truths, not_missing # type: ignore
@ -313,9 +320,9 @@ class TextCategorizer(TrainablePipe):
self._validate_categories(examples)
truths, not_missing = self._examples_to_truth(examples)
not_missing = self.model.ops.asarray(not_missing) # type: ignore
d_scores = (scores - truths) / scores.shape[0]
d_scores = (scores - truths)
d_scores *= not_missing
mean_square_error = (d_scores**2).sum(axis=1).mean()
mean_square_error = (d_scores**2).mean()
return float(mean_square_error), d_scores
def add_label(self, label: str) -> int:
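
For reference, the gradient is no longer divided by the batch size and the loss is now a plain element-wise mean; a small numeric sketch with made-up scores (NumPy only, not the spaCy API):

import numpy as np

scores = np.array([[0.0, 0.0, 0.0, 1.0], [0.0, 0.0, 1.0, 1.0]])
truths = np.array([[0.0, 0.0, 0.0, 1.0], [0.0, 0.0, 1.0, 0.0]])
d_scores = scores - truths
mean_square_error = (d_scores ** 2).mean()
print(mean_square_error)  # 0.125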

View File

@ -1,8 +1,8 @@
from itertools import islice
from typing import Iterable, Optional, Dict, List, Callable, Any
from thinc.api import Model, Config
from thinc.types import Floats2d
from thinc.api import Model, Config
from itertools import islice
from ..language import Language
from ..training import Example, validate_get_examples
@ -158,6 +158,10 @@ class MultiLabel_TextCategorizer(TextCategorizer):
self.cfg = dict(cfg)
self.scorer = scorer
@property
def support_missing_values(self):
return True
def initialize( # type: ignore[override]
self,
get_examples: Callable[[], Iterable[Example]],

View File

@ -445,7 +445,8 @@ class Scorer:
getter(doc, attr) should return the values for the individual doc.
labels (Iterable[str]): The set of possible labels. Defaults to [].
multi_label (bool): Whether the attribute allows multiple labels.
Defaults to True.
Defaults to True. When set to False (exclusive labels), missing
gold labels are interpreted as 0.0.
positive_label (str): The positive label for a binary task with
exclusive classes. Defaults to None.
threshold (float): Cutoff to consider a prediction "positive". Defaults
@ -484,13 +485,15 @@ class Scorer:
for label in labels:
pred_score = pred_cats.get(label, 0.0)
gold_score = gold_cats.get(label, 0.0)
gold_score = gold_cats.get(label)
if not gold_score and not multi_label:
gold_score = 0.0
if gold_score is not None:
auc_per_type[label].score_set(pred_score, gold_score)
if multi_label:
for label in labels:
pred_score = pred_cats.get(label, 0.0)
gold_score = gold_cats.get(label, 0.0)
gold_score = gold_cats.get(label)
if gold_score is not None:
if pred_score >= threshold and gold_score > 0:
f_per_type[label].tp += 1
@ -502,16 +505,15 @@ class Scorer:
# Get the highest-scoring for each.
pred_label, pred_score = max(pred_cats.items(), key=lambda it: it[1])
gold_label, gold_score = max(gold_cats.items(), key=lambda it: it[1])
if gold_score is not None:
if pred_label == gold_label and pred_score >= threshold:
f_per_type[pred_label].tp += 1
else:
f_per_type[gold_label].fn += 1
if pred_score >= threshold:
f_per_type[pred_label].fp += 1
if pred_label == gold_label and pred_score >= threshold:
f_per_type[pred_label].tp += 1
else:
f_per_type[gold_label].fn += 1
if pred_score >= threshold:
f_per_type[pred_label].fp += 1
elif gold_cats:
gold_label, gold_score = max(gold_cats, key=lambda it: it[1])
if gold_score is not None and gold_score > 0:
if gold_score > 0:
f_per_type[gold_label].fn += 1
elif pred_cats:
pred_label, pred_score = max(pred_cats.items(), key=lambda it: it[1])

View File

@ -155,6 +155,11 @@ def fr_tokenizer():
return get_lang_class("fr")().tokenizer
@pytest.fixture(scope="session")
def fr_vocab():
return get_lang_class("fr")().vocab
@pytest.fixture(scope="session")
def ga_tokenizer():
return get_lang_class("ga")().tokenizer
@ -205,6 +210,11 @@ def it_tokenizer():
return get_lang_class("it")().tokenizer
@pytest.fixture(scope="session")
def it_vocab():
return get_lang_class("it")().vocab
@pytest.fixture(scope="session")
def ja_tokenizer():
pytest.importorskip("sudachipy")

View File

@ -573,6 +573,55 @@ def test_span_with_vectors(doc):
doc.vocab.vectors = prev_vectors
# fmt: off
def test_span_comparison(doc):
# Identical start, end, only differ in label and kb_id
assert Span(doc, 0, 3) == Span(doc, 0, 3)
assert Span(doc, 0, 3, "LABEL") == Span(doc, 0, 3, "LABEL")
assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") == Span(doc, 0, 3, "LABEL", kb_id="KB_ID")
assert Span(doc, 0, 3) != Span(doc, 0, 3, "LABEL")
assert Span(doc, 0, 3) != Span(doc, 0, 3, "LABEL", kb_id="KB_ID")
assert Span(doc, 0, 3, "LABEL") != Span(doc, 0, 3, "LABEL", kb_id="KB_ID")
assert Span(doc, 0, 3) <= Span(doc, 0, 3) and Span(doc, 0, 3) >= Span(doc, 0, 3)
assert Span(doc, 0, 3, "LABEL") <= Span(doc, 0, 3, "LABEL") and Span(doc, 0, 3, "LABEL") >= Span(doc, 0, 3, "LABEL")
assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") <= Span(doc, 0, 3, "LABEL", kb_id="KB_ID")
assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") >= Span(doc, 0, 3, "LABEL", kb_id="KB_ID")
assert (Span(doc, 0, 3) < Span(doc, 0, 3, "", kb_id="KB_ID") < Span(doc, 0, 3, "LABEL") < Span(doc, 0, 3, "LABEL", kb_id="KB_ID"))
assert (Span(doc, 0, 3) <= Span(doc, 0, 3, "", kb_id="KB_ID") <= Span(doc, 0, 3, "LABEL") <= Span(doc, 0, 3, "LABEL", kb_id="KB_ID"))
assert (Span(doc, 0, 3, "LABEL", kb_id="KB_ID") > Span(doc, 0, 3, "LABEL") > Span(doc, 0, 3, "", kb_id="KB_ID") > Span(doc, 0, 3))
assert (Span(doc, 0, 3, "LABEL", kb_id="KB_ID") >= Span(doc, 0, 3, "LABEL") >= Span(doc, 0, 3, "", kb_id="KB_ID") >= Span(doc, 0, 3))
# Different end
assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") < Span(doc, 0, 4, "LABEL", kb_id="KB_ID")
assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") < Span(doc, 0, 4)
assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") <= Span(doc, 0, 4)
assert Span(doc, 0, 4) > Span(doc, 0, 3, "LABEL", kb_id="KB_ID")
assert Span(doc, 0, 4) >= Span(doc, 0, 3, "LABEL", kb_id="KB_ID")
# Different start
assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") != Span(doc, 1, 3, "LABEL", kb_id="KB_ID")
assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") < Span(doc, 1, 3)
assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") <= Span(doc, 1, 3)
assert Span(doc, 1, 3) > Span(doc, 0, 3, "LABEL", kb_id="KB_ID")
assert Span(doc, 1, 3) >= Span(doc, 0, 3, "LABEL", kb_id="KB_ID")
# Different start & different end
assert Span(doc, 0, 4, "LABEL", kb_id="KB_ID") != Span(doc, 1, 3, "LABEL", kb_id="KB_ID")
assert Span(doc, 0, 4, "LABEL", kb_id="KB_ID") < Span(doc, 1, 3)
assert Span(doc, 0, 4, "LABEL", kb_id="KB_ID") <= Span(doc, 1, 3)
assert Span(doc, 1, 3) > Span(doc, 0, 4, "LABEL", kb_id="KB_ID")
assert Span(doc, 1, 3) >= Span(doc, 0, 4, "LABEL", kb_id="KB_ID")
# fmt: on
@pytest.mark.parametrize(
"start,end,expected_sentences,expected_sentences_with_hook",
[

View File

@ -0,0 +1,174 @@
import pytest
from spacy.tokens import Doc
FI_NP_TEST_EXAMPLES = [
(
"Kaksi tyttöä potkii punaista palloa",
["NUM", "NOUN", "VERB", "ADJ", "NOUN"],
["nummod", "nsubj", "ROOT", "amod", "obj"],
[1, 1, 0, 1, -2],
["Kaksi tyttöä", "punaista palloa"],
),
(
"Erittäin vaarallinen leijona karkasi kiertävän sirkuksen eläintenkesyttäjältä",
["ADV", "ADJ", "NOUN", "VERB", "ADJ", "NOUN", "NOUN"],
["advmod", "amod", "nsubj", "ROOT", "amod", "nmod:poss", "obl"],
[1, 1, 1, 0, 1, 1, -3],
["Erittäin vaarallinen leijona", "kiertävän sirkuksen eläintenkesyttäjältä"],
),
(
"Leijona raidallisine tassuineen piileksii Porin kaupungin lähellä",
["NOUN", "ADJ", "NOUN", "VERB", "PROPN", "NOUN", "ADP"],
["nsubj", "amod", "nmod", "ROOT", "nmod:poss", "obl", "case"],
[3, 1, -2, 0, 1, -2, -1],
["Leijona raidallisine tassuineen", "Porin kaupungin"],
),
(
"Lounaalla nautittiin salaattia, maukasta kanaa ja raikasta vettä",
["NOUN", "VERB", "NOUN", "PUNCT", "ADJ", "NOUN", "CCONJ", "ADJ", "NOUN"],
["obl", "ROOT", "obj", "punct", "amod", "conj", "cc", "amod", "conj"],
[1, 0, -1, 2, 1, -3, 2, 1, -6],
["Lounaalla", "salaattia", "maukasta kanaa", "raikasta vettä"],
),
(
"Minua houkuttaa maalle muuttaminen talven jälkeen",
["PRON", "VERB", "NOUN", "NOUN", "NOUN", "ADP"],
["obj", "ROOT", "nmod", "nsubj", "obl", "case"],
[1, 0, 1, -2, -3, -1],
["maalle muuttaminen", "talven"],
),
(
"Päivän kohokohta oli vierailu museossa kummilasten kanssa",
["NOUN", "NOUN", "AUX", "NOUN", "NOUN", "NOUN", "ADP"],
["nmod:poss", "nsubj:cop", "cop", "ROOT", "nmod", "obl", "case"],
[1, 2, 1, 0, -1, -2, -1],
["Päivän kohokohta", "vierailu museossa", "kummilasten"],
),
(
"Yrittäjät maksoivat tuomioistuimen määräämät korvaukset",
["NOUN", "VERB", "NOUN", "VERB", "NOUN"],
["nsubj", "ROOT", "nsubj", "acl", "obj"],
[1, 0, 1, 1, -3],
["Yrittäjät", "tuomioistuimen", "korvaukset"],
),
(
"Julkisoikeudelliset tai niihin rinnastettavat saatavat ovat suoraan ulosottokelpoisia",
["ADJ", "CCONJ", "PRON", "VERB", "NOUN", "AUX", "ADV", "NOUN"],
["amod", "cc", "obl", "acl", "nsubj:cop", "cop", "advmod", "ROOT"],
[4, 3, 1, 1, 3, 2, 1, 0],
["Julkisoikeudelliset tai niihin rinnastettavat saatavat", "ulosottokelpoisia"],
),
(
"Se oli ala-arvoista käytöstä kaikilta oppilailta, myös valvojaoppilailta",
["PRON", "AUX", "ADJ", "NOUN", "PRON", "NOUN", "PUNCT", "ADV", "NOUN"],
["nsubj:cop", "cop", "amod", "ROOT", "det", "nmod", "punct", "advmod", "appos"],
[3, 2, 1, 0, 1, -2, 2, 1, -3],
["ala-arvoista käytöstä kaikilta oppilailta", "valvojaoppilailta"],
),
(
"Isä souti veneellä, jonka hän oli vuokrannut",
["NOUN", "VERB", "NOUN", "PUNCT", "PRON", "PRON", "AUX", "VERB"],
["nsubj", "ROOT", "obl", "punct", "obj", "nsubj", "aux", "acl:relcl"],
[1, 0, -1, 4, 3, 2, 1, -5],
["Isä", "veneellä"],
),
(
"Kirja, jonka poimin hyllystä, kertoo norsuista",
["NOUN", "PUNCT", "PRON", "VERB", "NOUN", "PUNCT", "VERB", "NOUN"],
["nsubj", "punct", "obj", "acl:relcl", "obl", "punct", "ROOT", "obl"],
[6, 2, 1, -3, -1, 1, 0, -1],
["Kirja", "hyllystä", "norsuista"],
),
(
"Huomenna on päivä, jota olemme odottaneet",
["NOUN", "AUX", "NOUN", "PUNCT", "PRON", "AUX", "VERB"],
["ROOT", "cop", "nsubj:cop", "punct", "obj", "aux", "acl:relcl"],
[0, -1, -2, 3, 2, 1, -4],
["Huomenna", "päivä"],
),
(
"Liikkuvuuden lisääminen on yksi korkeakoulutuksen keskeisistä kehittämiskohteista",
["NOUN", "NOUN", "AUX", "PRON", "NOUN", "ADJ", "NOUN"],
["nmod:gobj", "nsubj:cop", "cop", "ROOT", "nmod:poss", "amod", "nmod"],
[1, 2, 1, 0, 2, 1, -3],
[
"Liikkuvuuden lisääminen",
"korkeakoulutuksen keskeisistä kehittämiskohteista",
],
),
(
"Kaupalliset palvelut jätetään yksityisten palveluntarjoajien tarjottavaksi",
["ADJ", "NOUN", "VERB", "ADJ", "NOUN", "NOUN"],
["amod", "obj", "ROOT", "amod", "nmod:gsubj", "obl"],
[1, 1, 0, 1, 1, -3],
["Kaupalliset palvelut", "yksityisten palveluntarjoajien tarjottavaksi"],
),
(
"New York tunnetaan kaupunkina, joka ei koskaan nuku",
["PROPN", "PROPN", "VERB", "NOUN", "PUNCT", "PRON", "AUX", "ADV", "VERB"],
["obj", "flat:name", "ROOT", "obl", "punct", "nsubj", "aux", "advmod", "acl:relcl"],
[2, -1, 0, -1, 4, 3, 2, 1, -5],
["New York", "kaupunkina"],
),
(
"Loput vihjeet saat herra Möttöseltä",
["NOUN", "NOUN", "VERB", "NOUN", "PROPN"],
["compound:nn", "obj", "ROOT", "compound:nn", "obj"],
[1, 1, 0, 1, -2],
["Loput vihjeet", "herra Möttöseltä"],
),
(
"mahdollisuus tukea muita päivystysyksiköitä",
["NOUN", "VERB", "PRON", "NOUN"],
["ROOT", "acl", "det", "obj"],
[0, -1, 1, -2],
["mahdollisuus", "päivystysyksiköitä"],
),
(
"sairaanhoitopiirit harjoittavat leikkaustoimintaa alueellaan useammassa sairaalassa",
["NOUN", "VERB", "NOUN", "NOUN", "ADJ", "NOUN"],
["nsubj", "ROOT", "obj", "obl", "amod", "obl"],
[1, 0, -1, -1, 1, -3],
["sairaanhoitopiirit", "leikkaustoimintaa", "alueellaan", "useammassa sairaalassa"],
),
(
"Lain mukaan varhaiskasvatus on suunnitelmallista toimintaa",
["NOUN", "ADP", "NOUN", "AUX", "ADJ", "NOUN"],
["obl", "case", "nsubj:cop", "cop", "amod", "ROOT"],
[5, -1, 3, 2, 1, 0],
["Lain", "varhaiskasvatus", "suunnitelmallista toimintaa"],
),
]
def test_noun_chunks_is_parsed(fi_tokenizer):
"""Test that noun_chunks raises Value Error for 'fi' language if Doc is not parsed.
To check this test, we're constructing a Doc
with a new Vocab here and forcing is_parsed to 'False'
to make sure the noun chunks don't run.
"""
doc = fi_tokenizer("Tämä on testi")
with pytest.raises(ValueError):
list(doc.noun_chunks)
@pytest.mark.parametrize(
"text,pos,deps,heads,expected_noun_chunks", FI_NP_TEST_EXAMPLES
)
def test_fi_noun_chunks(fi_tokenizer, text, pos, deps, heads, expected_noun_chunks):
tokens = fi_tokenizer(text)
assert len(heads) == len(pos)
doc = Doc(
tokens.vocab,
words=[t.text for t in tokens],
heads=[head + i for i, head in enumerate(heads)],
deps=deps,
pos=pos,
)
noun_chunks = list(doc.noun_chunks)
assert len(noun_chunks) == len(expected_noun_chunks)
for i, np in enumerate(noun_chunks):
assert np.text == expected_noun_chunks[i]

View File

@ -1,8 +1,230 @@
from spacy.tokens import Doc
import pytest
# fmt: off
@pytest.mark.parametrize(
"words,heads,deps,pos,chunk_offsets",
[
# determiner + noun
# un nom -> un nom
(
["un", "nom"],
[1, 1],
["det", "ROOT"],
["DET", "NOUN"],
[(0, 2)],
),
# determiner + noun starting with vowel
# l'heure -> l'heure
(
["l'", "heure"],
[1, 1],
["det", "ROOT"],
["DET", "NOUN"],
[(0, 2)],
),
# determiner + plural noun
# les romans -> les romans
(
["les", "romans"],
[1, 1],
["det", "ROOT"],
["DET", "NOUN"],
[(0, 2)],
),
# det + adj + noun
# Le vieux Londres -> Le vieux Londres
(
['Les', 'vieux', 'Londres'],
[2, 2, 2],
["det", "amod", "ROOT"],
["DET", "ADJ", "NOUN"],
[(0,3)]
),
# det + noun + adj
# le nom propre -> le nom propre ("a proper noun")
(
["le", "nom", "propre"],
[1, 1, 1],
["det", "ROOT", "amod"],
["DET", "NOUN", "ADJ"],
[(0, 3)],
),
# det + noun + adj plural
# Les chiens bruns -> les chiens bruns
(
["Les", "chiens", "bruns"],
[1, 1, 1],
["det", "ROOT", "amod"],
["DET", "NOUN", "ADJ"],
[(0, 3)],
),
# multiple adjectives: one adj before the noun, one adj after the noun
# un nouveau film intéressant -> un nouveau film intéressant
(
["un", "nouveau", "film", "intéressant"],
[2, 2, 2, 2],
["det", "amod", "ROOT", "amod"],
["DET", "ADJ", "NOUN", "ADJ"],
[(0,4)]
),
# multiple adjectives, both adjs after the noun
# une personne intelligente et drôle -> une personne intelligente et drôle
(
["une", "personne", "intelligente", "et", "drôle"],
[1, 1, 1, 4, 2],
["det", "ROOT", "amod", "cc", "conj"],
["DET", "NOUN", "ADJ", "CCONJ", "ADJ"],
[(0,5)]
),
# relative pronoun
# un bus qui va au ville -> un bus, qui, ville
(
['un', 'bus', 'qui', 'va', 'au', 'ville'],
[1, 1, 3, 1, 5, 3],
['det', 'ROOT', 'nsubj', 'acl:relcl', 'case', 'obl:arg'],
['DET', 'NOUN', 'PRON', 'VERB', 'ADP', 'NOUN'],
[(0,2), (2,3), (5,6)]
),
# relative subclause
# Voilà la maison que nous voulons acheter -> la maison, nous ("That's the house that we want to buy.")
(
['Voilà', 'la', 'maison', 'que', 'nous', 'voulons', 'acheter'],
[0, 2, 0, 5, 5, 2, 5],
['ROOT', 'det', 'obj', 'mark', 'nsubj', 'acl:relcl', 'xcomp'],
['VERB', 'DET', 'NOUN', 'SCONJ', 'PRON', 'VERB', 'VERB'],
[(1,3), (4,5)]
),
# Person name and title by flat
# Louis XIV -> Louis XIV
(
["Louis", "XIV"],
[0, 0],
["ROOT", "flat:name"],
["PROPN", "PROPN"],
[(0,2)]
),
# Organization name by flat
# Nations Unies -> Nations Unies
(
["Nations", "Unies"],
[0, 0],
["ROOT", "flat:name"],
["PROPN", "PROPN"],
[(0,2)]
),
# Noun compound, person name created by two flats
# Louise de Bratagne -> Louise de Bratagne
(
["Louise", "de", "Bratagne"],
[0, 0, 0],
["ROOT", "flat:name", "flat:name"],
["PROPN", "PROPN", "PROPN"],
[(0,3)]
),
# Noun compound, person name created by two flats
# Louis François Joseph -> Louis François Joseph
(
["Louis", "François", "Joseph"],
[0, 0, 0],
["ROOT", "flat:name", "flat:name"],
["PROPN", "PROPN", "PROPN"],
[(0,3)]
),
# one determiner + one noun + one adjective qualified by an adverb
# quelques agriculteurs très riches -> quelques agriculteurs très riches
(
["quelques", "agriculteurs", "très", "riches"],
[1, 1, 3, 1],
['det', 'ROOT', 'advmod', 'amod'],
['DET', 'NOUN', 'ADV', 'ADJ'],
[(0,4)]
),
# Two NPs conjuncted
# Il a un chien et un chat -> Il, un chien, un chat
(
['Il', 'a', 'un', 'chien', 'et', 'un', 'chat'],
[1, 1, 3, 1, 6, 6, 3],
['nsubj', 'ROOT', 'det', 'obj', 'cc', 'det', 'conj'],
['PRON', 'VERB', 'DET', 'NOUN', 'CCONJ', 'DET', 'NOUN'],
[(0,1), (2,4), (5,7)]
),
# Two NPs together
# l'écrivain brésilien Aníbal Machado -> l'écrivain brésilien, Aníbal Machado
(
["l'", 'écrivain', 'brésilien', 'Aníbal', 'Machado'],
[1, 1, 1, 1, 3],
['det', 'ROOT', 'amod', 'appos', 'flat:name'],
['DET', 'NOUN', 'ADJ', 'PROPN', 'PROPN'],
[(0, 3), (3, 5)]
),
# nmod relation between NPs
# la destruction de la ville -> la destruction, la ville
(
['la', 'destruction', 'de', 'la', 'ville'],
[1, 1, 4, 4, 1],
['det', 'ROOT', 'case', 'det', 'nmod'],
['DET', 'NOUN', 'ADP', 'DET', 'NOUN'],
[(0,2), (3,5)]
),
# nmod relation between NPs
# Archiduchesse d'Autriche -> Archiduchesse, Autriche
(
['Archiduchesse', 'd', 'Autriche'],
[0, 2, 0],
['ROOT', 'case', 'nmod'],
['NOUN', 'ADP', 'PROPN'],
[(0,1), (2,3)]
),
# Compounding by nmod, several NPs chained together
# la première usine de drogue du gouvernement -> la première usine, drogue, gouvernement
(
["la", "première", "usine", "de", "drogue", "du", "gouvernement"],
[2, 2, 2, 4, 2, 6, 2],
['det', 'amod', 'ROOT', 'case', 'nmod', 'case', 'nmod'],
['DET', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN'],
[(0, 3), (4, 5), (6, 7)]
),
# several NPs
# Traduction du rapport de Susana -> Traduction, rapport, Susana
(
['Traduction', 'du', 'raport', 'de', 'Susana'],
[0, 2, 0, 4, 2],
['ROOT', 'case', 'nmod', 'case', 'nmod'],
['NOUN', 'ADP', 'NOUN', 'ADP', 'PROPN'],
[(0,1), (2,3), (4,5)]
),
# Several NPs
# Le gros chat de Susana et son amie -> Le gros chat, Susana, son amie
(
['Le', 'gros', 'chat', 'de', 'Susana', 'et', 'son', 'amie'],
[2, 2, 2, 4, 2, 7, 7, 2],
['det', 'amod', 'ROOT', 'case', 'nmod', 'cc', 'det', 'conj'],
['DET', 'ADJ', 'NOUN', 'ADP', 'PROPN', 'CCONJ', 'DET', 'NOUN'],
[(0,3), (4,5), (6,8)]
),
# Passive subject
# Les nouvelles dépenses sont alimentées par le grand compte bancaire de Clinton -> Les nouvelles dépenses, le grand compte bancaire, Clinton
(
['Les', 'nouvelles', 'dépenses', 'sont', 'alimentées', 'par', 'le', 'grand', 'compte', 'bancaire', 'de', 'Clinton'],
[2, 2, 4, 4, 4, 8, 8, 8, 4, 8, 11, 8],
['det', 'amod', 'nsubj:pass', 'aux:pass', 'ROOT', 'case', 'det', 'amod', 'obl:agent', 'amod', 'case', 'nmod'],
['DET', 'ADJ', 'NOUN', 'AUX', 'VERB', 'ADP', 'DET', 'ADJ', 'NOUN', 'ADJ', 'ADP', 'PROPN'],
[(0, 3), (6, 10), (11, 12)]
)
],
)
# fmt: on
def test_fr_noun_chunks(fr_vocab, words, heads, deps, pos, chunk_offsets):
doc = Doc(fr_vocab, words=words, heads=heads, deps=deps, pos=pos)
assert [(c.start, c.end) for c in doc.noun_chunks] == chunk_offsets
def test_noun_chunks_is_parsed_fr(fr_tokenizer):
"""Test that noun_chunks raises Value Error for 'fr' language if Doc is not parsed."""
doc = fr_tokenizer("trouver des travaux antérieurs")
doc = fr_tokenizer("Je suis allé à l'école")
with pytest.raises(ValueError):
list(doc.noun_chunks)

View File

@ -0,0 +1,221 @@
from spacy.tokens import Doc
import pytest
# fmt: off
@pytest.mark.parametrize(
"words,heads,deps,pos,chunk_offsets",
[
# determiner + noun
# un pollo -> un pollo
(
["un", "pollo"],
[1, 1],
["det", "ROOT"],
["DET", "NOUN"],
[(0,2)],
),
# two determiners + noun
# il mio cane -> il mio cane
(
["il", "mio", "cane"],
[2, 2, 2],
["det", "det:poss", "ROOT"],
["DET", "DET", "NOUN"],
[(0,3)],
),
# two determiners, one after the noun; rare usage but still worth testing
# il cane mio -> il cane mio
(
["il", "cane", "mio"],
[1, 1, 1],
["det", "ROOT", "det:poss"],
["DET", "NOUN", "DET"],
[(0,3)],
),
# relative pronoun
# È molto bello il vestito che hai acquistato -> il vestito, che ("The dress that you bought is very pretty.")
(
["È", "molto", "bello", "il", "vestito", "che", "hai", "acquistato"],
[2, 2, 2, 4, 2, 7, 7, 4],
['cop', 'advmod', 'ROOT', 'det', 'nsubj', 'obj', 'aux', 'acl:relcl'],
['AUX', 'ADV', 'ADJ', 'DET', 'NOUN', 'PRON', 'AUX', 'VERB'],
[(3,5), (5,6)]
),
# relative subclause
# il computer che hai comprato -> il computer, che ("the computer that you bought")
(
['il', 'computer', 'che', 'hai', 'comprato'],
[1, 1, 4, 4, 1],
['det', 'ROOT', 'nsubj', 'aux', 'acl:relcl'],
['DET', 'NOUN', 'PRON', 'AUX', 'VERB'],
[(0,2), (2,3)]
),
# det + noun + adj
# Una macchina grande -> Una macchina grande
(
["Una", "macchina", "grande"],
[1, 1, 1],
["det", "ROOT", "amod"],
["DET", "NOUN", "ADJ"],
[(0,3)],
),
# noun + adj plural
# mucche bianche
(
["mucche", "bianche"],
[0, 0],
["ROOT", "amod"],
["NOUN", "ADJ"],
[(0,2)],
),
# det + adj + noun
# Una grande macchina -> Una grande macchina
(
['Una', 'grande', 'macchina'],
[2, 2, 2],
["det", "amod", "ROOT"],
["DET", "ADJ", "NOUN"],
[(0,3)]
),
# det + adj + noun, det with apostrophe
# un'importante associazione -> un'importante associazione
(
["Un'", 'importante', 'associazione'],
[2, 2, 2],
["det", "amod", "ROOT"],
["DET", "ADJ", "NOUN"],
[(0,3)]
),
# multiple adjectives
# Un cane piccolo e marrone -> Un cane piccolo e marrone
(
["Un", "cane", "piccolo", "e", "marrone"],
[1, 1, 1, 4, 2],
["det", "ROOT", "amod", "cc", "conj"],
["DET", "NOUN", "ADJ", "CCONJ", "ADJ"],
[(0,5)]
),
# determiner, adjective, compound created by flat
# le Nazioni Unite -> le Nazioni Unite
(
["le", "Nazioni", "Unite"],
[1, 1, 1],
["det", "ROOT", "flat:name"],
["DET", "PROPN", "PROPN"],
[(0,3)]
),
# one determiner + one noun + one adjective qualified by an adverb
# alcuni contadini molto ricchi -> alcuni contadini molto ricchi ("some very rich farmers")
(
['alcuni', 'contadini', 'molto', 'ricchi'],
[1, 1, 3, 1],
['det', 'ROOT', 'advmod', 'amod'],
['DET', 'NOUN', 'ADV', 'ADJ'],
[(0,4)]
),
# Two NPs conjuncted
# Ho un cane e un gatto -> un cane, un gatto
(
['Ho', 'un', 'cane', 'e', 'un', 'gatto'],
[0, 2, 0, 5, 5, 0],
['ROOT', 'det', 'obj', 'cc', 'det', 'conj'],
['VERB', 'DET', 'NOUN', 'CCONJ', 'DET', 'NOUN'],
[(1,3), (4,6)]
),
# Two NPs together
# lo scrittore brasiliano Aníbal Machado -> lo scrittore brasiliano, Aníbal Machado
(
['lo', 'scrittore', 'brasiliano', 'Aníbal', 'Machado'],
[1, 1, 1, 1, 3],
['det', 'ROOT', 'amod', 'nmod', 'flat:name'],
['DET', 'NOUN', 'ADJ', 'PROPN', 'PROPN'],
[(0, 3), (3, 5)]
),
# Noun compound, person name and titles
# Dom Pedro II -> Dom Pedro II
(
["Dom", "Pedro", "II"],
[0, 0, 0],
["ROOT", "flat:name", "flat:name"],
["PROPN", "PROPN", "PROPN"],
[(0,3)]
),
# Noun compound created by flat
# gli Stati Uniti
(
["gli", "Stati", "Uniti"],
[1, 1, 1],
["det", "ROOT", "flat:name"],
["DET", "PROPN", "PROPN"],
[(0,3)]
),
# nmod relation between NPs
# la distruzione della città -> la distruzione, città
(
['la', 'distruzione', 'della', 'città'],
[1, 1, 3, 1],
['det', 'ROOT', 'case', 'nmod'],
['DET', 'NOUN', 'ADP', 'NOUN'],
[(0,2), (3,4)]
),
# Compounding by nmod, several NPs chained together
# la prima fabbrica di droga del governo -> la prima fabbrica, droga, governo
(
["la", "prima", "fabbrica", "di", "droga", "del", "governo"],
[2, 2, 2, 4, 2, 6, 2],
['det', 'amod', 'ROOT', 'case', 'nmod', 'case', 'nmod'],
['DET', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN'],
[(0, 3), (4, 5), (6, 7)]
),
# several NPs
# Traduzione del rapporto di Susana -> Traduzione, rapporto, Susana
(
['Traduzione', 'del', 'rapporto', 'di', 'Susana'],
[0, 2, 0, 4, 2],
['ROOT', 'case', 'nmod', 'case', 'nmod'],
['NOUN', 'ADP', 'NOUN', 'ADP', 'PROPN'],
[(0,1), (2,3), (4,5)]
),
# Several NPs
# Il gatto grasso di Susana e la sua amica -> Il gatto grasso, Susana, sua amica
(
['Il', 'gatto', 'grasso', 'di', 'Susana', 'e', 'la', 'sua', 'amica'],
[1, 1, 1, 4, 1, 8, 8, 8, 1],
['det', 'ROOT', 'amod', 'case', 'nmod', 'cc', 'det', 'det:poss', 'conj'],
['DET', 'NOUN', 'ADJ', 'ADP', 'PROPN', 'CCONJ', 'DET', 'DET', 'NOUN'],
[(0,3), (4,5), (6,9)]
),
# Passive subject
# La nuova spesa è alimentata dal grande conto in banca di Clinton -> La nuova spesa, grande conto, banca, Clinton
(
['La', 'nuova', 'spesa', 'è', 'alimentata', 'dal', 'grande', 'conto', 'in', 'banca', 'di', 'Clinton'],
[2, 2, 4, 4, 4, 7, 7, 4, 9, 7, 11, 9],
['det', 'amod', 'nsubj:pass', 'aux:pass', 'ROOT', 'case', 'amod', 'obl:agent', 'case', 'nmod', 'case', 'nmod'],
['DET', 'ADJ', 'NOUN', 'AUX', 'VERB', 'ADP', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'ADP', 'PROPN'],
[(0, 3), (6, 8), (9, 10), (11,12)]
),
# Misc
# Ma mentre questo prestito possa ora sembrare gestibile, un improvviso cambiamento delle circostanze potrebbe portare a problemi di debiti -> questo prestito, un improvviso cambiamento, circostanze, problemi, debiti
(
['Ma', 'mentre', 'questo', 'prestito', 'possa', 'ora', 'sembrare', 'gestibile', ',', 'un', 'improvviso', 'cambiamento', 'delle', 'circostanze', 'potrebbe', 'portare', 'a', 'problemi', 'di', 'debitii'],
[15, 6, 3, 6, 6, 6, 15, 6, 6, 11, 11, 15, 13, 11, 15, 15, 17, 15, 19, 17],
['cc', 'mark', 'det', 'nsubj', 'aux', 'advmod', 'advcl', 'xcomp', 'punct', 'det', 'amod', 'nsubj', 'case', 'nmod', 'aux', 'ROOT', 'case', 'obl', 'case', 'nmod'],
['CCONJ', 'SCONJ', 'DET', 'NOUN', 'AUX', 'ADV', 'VERB', 'ADJ', 'PUNCT', 'DET', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'AUX', 'VERB', 'ADP', 'NOUN', 'ADP', 'NOUN'],
[(2,4), (9,12), (13,14), (17,18), (19,20)]
)
],
)
# fmt: on
def test_it_noun_chunks(it_vocab, words, heads, deps, pos, chunk_offsets):
doc = Doc(it_vocab, words=words, heads=heads, deps=deps, pos=pos)
assert [(c.start, c.end) for c in doc.noun_chunks] == chunk_offsets
def test_noun_chunks_is_parsed_it(it_tokenizer):
"""Test that noun_chunks raises Value Error for 'it' language if Doc is not parsed."""
doc = it_tokenizer("Sei andato a Oxford")
with pytest.raises(ValueError):
list(doc.noun_chunks)

View File

@ -0,0 +1,17 @@
import pytest
@pytest.mark.parametrize(
"word", ["un", "lo", "dell", "dall", "si", "ti", "mi", "quest", "quel", "quello"]
)
def test_stopwords_basic(it_tokenizer, word):
tok = it_tokenizer(word)[0]
assert tok.is_stop
@pytest.mark.parametrize(
"word", ["quest'uomo", "l'ho", "un'amica", "dell'olio", "s'arrende", "m'ascolti"]
)
def test_stopwords_elided(it_tokenizer, word):
tok = it_tokenizer(word)[0]
assert tok.is_stop

View File

@ -277,6 +277,21 @@ def test_issue7019():
print_prf_per_type(msg, scores, name="foo", type="bar")
@pytest.mark.issue(9904)
def test_issue9904():
nlp = Language()
textcat = nlp.add_pipe("textcat")
get_examples = make_get_examples_single_label(nlp)
nlp.initialize(get_examples)
examples = get_examples()
scores = textcat.predict([eg.predicted for eg in examples])
loss = textcat.get_loss(examples, scores)[0]
loss_double_bs = textcat.get_loss(examples * 2, scores.repeat(2, axis=0))[0]
assert loss == pytest.approx(loss_double_bs)
@pytest.mark.skip(reason="Test is flakey when run with others")
def test_simple_train():
nlp = Language()
@ -725,6 +740,72 @@ def test_textcat_evaluation():
assert scores["cats_micro_r"] == 4 / 6
@pytest.mark.parametrize(
"multi_label,spring_p",
[(True, 1 / 1), (False, 1 / 2)],
)
def test_textcat_eval_missing(multi_label: bool, spring_p: float):
"""
multi-label: the missing 'spring' in gold_doc_2 doesn't incur a penalty
exclusive labels: the missing 'spring' in gold_doc_2 is interpreted as 0.0"""
train_examples = []
nlp = English()
ref1 = nlp("one")
ref1.cats = {"winter": 0.0, "summer": 0.0, "autumn": 0.0, "spring": 1.0}
pred1 = nlp("one")
pred1.cats = {"winter": 0.0, "summer": 0.0, "autumn": 0.0, "spring": 1.0}
train_examples.append(Example(ref1, pred1))
ref2 = nlp("two")
# reference 'spring' is missing, pred 'spring' is 1
ref2.cats = {"winter": 0.0, "summer": 0.0, "autumn": 1.0}
pred2 = nlp("two")
pred2.cats = {"winter": 0.0, "summer": 0.0, "autumn": 0.0, "spring": 1.0}
train_examples.append(Example(pred2, ref2))
scores = Scorer().score_cats(
train_examples,
"cats",
labels=["winter", "summer", "spring", "autumn"],
multi_label=multi_label,
)
assert scores["cats_f_per_type"]["spring"]["p"] == spring_p
assert scores["cats_f_per_type"]["spring"]["r"] == 1 / 1
@pytest.mark.parametrize(
"multi_label,expected_loss",
[(True, 0), (False, 0.125)],
)
def test_textcat_loss(multi_label: bool, expected_loss: float):
"""
multi-label: the missing 'spring' in gold_doc_2 doesn't incur an increase in loss
exclusive labels: the missing 'spring' in gold_doc_2 is interpreted as 0.0 and adds to the loss"""
train_examples = []
nlp = English()
doc1 = nlp("one")
cats1 = {"winter": 0.0, "summer": 0.0, "autumn": 0.0, "spring": 1.0}
train_examples.append(Example.from_dict(doc1, {"cats": cats1}))
doc2 = nlp("two")
cats2 = {"winter": 0.0, "summer": 0.0, "autumn": 1.0}
train_examples.append(Example.from_dict(doc2, {"cats": cats2}))
if multi_label:
textcat = nlp.add_pipe("textcat_multilabel")
else:
textcat = nlp.add_pipe("textcat")
textcat.initialize(lambda: train_examples)
assert isinstance(textcat, TextCategorizer)
scores = textcat.model.ops.asarray(
[[0.0, 0.0, 0.0, 1.0], [0.0, 0.0, 1.0, 1.0]], dtype="f" # type: ignore
)
loss, d_scores = textcat.get_loss(train_examples, scores)
assert loss == expected_loss
def test_textcat_threshold():
# Ensure the scorer can be called with a different threshold
nlp = English()

View File

@ -421,3 +421,13 @@ def test_Example_missing_heads():
# Ensure that the missing head doesn't create an artificial new sentence start
expected = [True, False, False, False, False, False]
assert example.get_aligned_sent_starts() == expected
def test_Example_aligned_whitespace(en_vocab):
words = ["a", " ", "b"]
tags = ["A", "SPACE", "B"]
predicted = Doc(en_vocab, words=words)
reference = Doc(en_vocab, words=words, tags=tags)
example = Example(predicted, reference)
assert example.get_aligned("TAG", as_string=True) == tags

View File

@ -126,38 +126,26 @@ cdef class Span:
return False
else:
return True
self_tuple = (self.c.start_char, self.c.end_char, self.c.label, self.c.kb_id, self.doc)
other_tuple = (other.c.start_char, other.c.end_char, other.c.label, other.c.kb_id, other.doc)
# <
if op == 0:
return self.c.start_char < other.c.start_char
return self_tuple < other_tuple
# <=
elif op == 1:
return self.c.start_char <= other.c.start_char
return self_tuple <= other_tuple
# ==
elif op == 2:
# Do the cheap comparisons first
return (
(self.c.start_char == other.c.start_char) and \
(self.c.end_char == other.c.end_char) and \
(self.c.label == other.c.label) and \
(self.c.kb_id == other.c.kb_id) and \
(self.doc == other.doc)
)
return self_tuple == other_tuple
# !=
elif op == 3:
# Do the cheap comparisons first
return not (
(self.c.start_char == other.c.start_char) and \
(self.c.end_char == other.c.end_char) and \
(self.c.label == other.c.label) and \
(self.c.kb_id == other.c.kb_id) and \
(self.doc == other.doc)
)
return self_tuple != other_tuple
# >
elif op == 4:
return self.c.start_char > other.c.start_char
return self_tuple > other_tuple
# >=
elif op == 5:
return self.c.start_char >= other.c.start_char
return self_tuple >= other_tuple
def __hash__(self):
return hash((self.doc, self.c.start_char, self.c.end_char, self.c.label, self.c.kb_id))
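
Spans now compare as the tuple (start_char, end_char, label, kb_id, doc) instead of by start character alone; a quick sketch, assuming a blank English pipeline:

import spacy
from spacy.tokens import Span

nlp = spacy.blank("en")
doc = nlp("New York City is big")
a = Span(doc, 0, 2, label="GPE")   # "New York", ends at char 8
b = Span(doc, 0, 3)                # "New York City", ends at char 13
print(a < b)                               # True: end_char is compared before label
print(a == Span(doc, 0, 2, label="GPE"))   # True: label and kb_id now count for equality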

View File

@ -188,6 +188,7 @@ def conllu_sentence_to_doc(
id_ = int(id_) - 1
head = (int(head) - 1) if head not in ("0", "_") else id_
tag = pos if tag == "_" else tag
pos = pos if pos != "_" else ""
morph = morph if morph != "_" else ""
dep = "ROOT" if dep == "root" else dep
lemmas.append(lemma)

View File

@ -159,20 +159,17 @@ cdef class Example:
gold_values = self.reference.to_array([field])
output = [None] * len(self.predicted)
for token in self.predicted:
if token.is_space:
values = gold_values[align[token.i].dataXd]
values = values.ravel()
if len(values) == 0:
output[token.i] = None
elif len(values) == 1:
output[token.i] = values[0]
elif len(set(list(values))) == 1:
# If all aligned tokens have the same value, use it.
output[token.i] = values[0]
else:
values = gold_values[align[token.i].dataXd]
values = values.ravel()
if len(values) == 0:
output[token.i] = None
elif len(values) == 1:
output[token.i] = values[0]
elif len(set(list(values))) == 1:
# If all aligned tokens have the same value, use it.
output[token.i] = values[0]
else:
output[token.i] = None
output[token.i] = None
if as_string and field not in ["ENT_IOB", "SENT_START"]:
output = [vocab.strings[o] if o is not None else o for o in output]
return output

View File

@ -34,7 +34,11 @@ only.
Predictions will be saved to `doc.cats` as a dictionary, where the key is the
name of the category and the value is a score between 0 and 1 (inclusive). For
`textcat` (exclusive categories), the scores will sum to 1, while for
`textcat_multilabel` there is no particular guarantee about their sum.
`textcat_multilabel` there is no particular guarantee about their sum. This also
means that for `textcat`, missing values are equated to a value of 0 (i.e.
`False`) and are counted as such towards the loss and scoring metrics. This is
not the case for `textcat_multilabel`, where missing values in the gold standard
data do not influence the loss or accuracy calculations.
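
For example (category names invented for illustration):

import spacy

nlp = spacy.blank("en")
doc = nlp("A cold but sunny day.")
# Gold annotation that leaves out the "spring" category entirely:
doc.cats = {"winter": 0.0, "summer": 0.0, "autumn": 1.0}
# textcat (exclusive): the missing "spring" is scored as if it were 0.0.
# textcat_multilabel: the missing "spring" is ignored by loss and metrics.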
Note that when assigning values to create training data, the score of each
category must be 0 or 1. Using other values, for example to create a document