Revert "Bump sudachipy version (#9917)" (#10071)

This reverts commit 58bdd8607b.
Author: Adriane Boyd
Date: 2022-01-17 10:38:37 +01:00 (committed via GitHub)
Parent: 6a8619dd73
Commit: add52935ff
10 changed files with 162 additions and 624 deletions

setup.cfg

@@ -108,8 +108,8 @@ apple =
     thinc-apple-ops>=0.0.4,<1.0.0
 # Language tokenizers with external dependencies
 ja =
-    sudachipy>=0.5.2,!=0.6.1
-    sudachidict_core>=20211220
+    sudachipy>=0.4.9
+    sudachidict_core>=20200330
 ko =
     natto-py==0.9.0
 th =
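The effect of this hunk is to widen the ja extra back to its pre-bump ranges. A minimal sketch of what the two sudachipy specifier lines accept, using the third-party packaging library (illustration only; the names bumped and reverted are ours, not part of the diff):

    from packaging.specifiers import SpecifierSet

    bumped = SpecifierSet(">=0.5.2,!=0.6.1")  # requirement before the revert
    reverted = SpecifierSet(">=0.4.9")        # requirement after the revert

    for version in ["0.4.9", "0.5.2", "0.6.0", "0.6.1"]:
        print(version, version in bumped, version in reverted)
    # 0.4.9 is accepted only by the reverted range; 0.6.1, which the
    # bumped range explicitly excluded, is accepted again after the revert.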

spacy/lang/char_classes.py

@@ -45,10 +45,6 @@ _hangul_syllables = r"\uAC00-\uD7AF"
 _hangul_jamo = r"\u1100-\u11FF"
 _hangul = _hangul_syllables + _hangul_jamo

-_hiragana = r"\u3040-\u309F"
-_katakana = r"\u30A0-\u30FFー"
-_kana = _hiragana + _katakana
-
 # letters with diacritics - Catalan, Czech, Latin, Latvian, Lithuanian, Polish, Slovak, Turkish, Welsh
 _latin_u_extendedA = (
     r"\u0100\u0102\u0104\u0106\u0108\u010A\u010C\u010E\u0110\u0112\u0114\u0116\u0118\u011A\u011C"
@@ -248,7 +244,6 @@ _uncased = (
     + _tamil
     + _telugu
     + _hangul
-    + _kana
     + _cjk
 )
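The revert drops the kana ranges both as named character classes and from the _uncased set. A small self-contained check of what the removed ranges matched (a sketch for illustration; kana_re is our name, not spaCy's):

    import re

    # The two ranges deleted above: hiragana, and katakana plus the
    # long vowel mark "ー".
    kana_re = re.compile(r"[\u3040-\u309F\u30A0-\u30FFー]")

    print(bool(kana_re.match("ひ")))  # True: hiragana
    print(bool(kana_re.match("カ")))  # True: katakana
    print(bool(kana_re.match("漢")))  # False: kanji is covered by _cjk, not _kana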

spacy/lang/fr/syntax_iterators.py

@@ -6,35 +6,16 @@ from ...tokens import Doc, Span


 def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
-    """
-    Detect base noun phrases from a dependency parse. Works on both Doc and Span.
-    """
-    labels = [
-        "nsubj",
-        "nsubj:pass",
-        "obj",
-        "obl",
-        "obl:agent",
-        "obl:arg",
-        "obl:mod",
-        "nmod",
-        "pcomp",
-        "appos",
-        "ROOT",
-    ]
-    post_modifiers = ["flat", "flat:name", "flat:foreign", "fixed", "compound"]
+    """Detect base noun phrases from a dependency parse. Works on Doc and Span."""
+    # fmt: off
+    labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
+    # fmt: on
     doc = doclike.doc  # Ensure works on both Doc and Span.
     if not doc.has_annotation("DEP"):
         raise ValueError(Errors.E029)
-    np_deps = {doc.vocab.strings.add(label) for label in labels}
-    np_modifs = {doc.vocab.strings.add(modifier) for modifier in post_modifiers}
+    np_deps = [doc.vocab.strings[label] for label in labels]
+    conj = doc.vocab.strings.add("conj")
     np_label = doc.vocab.strings.add("NP")
-    adj_label = doc.vocab.strings.add("amod")
-    det_label = doc.vocab.strings.add("det")
-    det_pos = doc.vocab.strings.add("DET")
-    adp_pos = doc.vocab.strings.add("ADP")
-    conj_label = doc.vocab.strings.add("conj")
-    conj_pos = doc.vocab.strings.add("CCONJ")
     prev_end = -1
     for i, word in enumerate(doclike):
         if word.pos not in (NOUN, PROPN, PRON):
@@ -43,45 +24,16 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
         if word.left_edge.i <= prev_end:
             continue
         if word.dep in np_deps:
-            right_childs = list(word.rights)
-            right_child = right_childs[0] if right_childs else None
-            if right_child:
-                if (
-                    right_child.dep == adj_label
-                ):  # allow chain of adjectives by expanding to right
-                    right_end = right_child.right_edge
-                elif (
-                    right_child.dep == det_label and right_child.pos == det_pos
-                ):  # cut relative pronouns here
-                    right_end = right_child
-                elif right_child.dep in np_modifs:  # Check if we can expand to right
-                    right_end = word.right_edge
-                else:
-                    right_end = word
-            else:
-                right_end = word
-            prev_end = right_end.i
-            left_index = word.left_edge.i
-            left_index = (
-                left_index + 1 if word.left_edge.pos == adp_pos else left_index
-            )
-            yield left_index, right_end.i + 1, np_label
-        elif word.dep == conj_label:
+            prev_end = word.right_edge.i
+            yield word.left_edge.i, word.right_edge.i + 1, np_label
+        elif word.dep == conj:
             head = word.head
-            while head.dep == conj_label and head.head.i < head.i:
+            while head.dep == conj and head.head.i < head.i:
                 head = head.head
             # If the head is an NP, and we're coordinated to it, we're an NP
             if head.dep in np_deps:
-                prev_end = word.i
-                left_index = word.left_edge.i  # eliminate left attached conjunction
-                left_index = (
-                    left_index + 1 if word.left_edge.pos == conj_pos else left_index
-                )
-                yield left_index, word.i + 1, np_label
+                prev_end = word.right_edge.i
+                yield word.left_edge.i, word.right_edge.i + 1, np_label


 SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
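To see the behavioral difference, take "la destruction de la ville" from the French test data deleted later in this commit: the removed iterator split off nmod dependents and trimmed leading case markers, yielding "la destruction" and "la ville", while the restored iterator expands every chunk to its head's right edge. A runnable sketch against the restored iterator (hand-annotated Doc, no model needed; expected outputs follow from the code above and the deleted test expectations):

    from spacy.lang.fr import French
    from spacy.tokens import Doc

    nlp = French()
    doc = Doc(
        nlp.vocab,
        words=["la", "destruction", "de", "la", "ville"],
        heads=[1, 1, 4, 4, 1],
        deps=["det", "ROOT", "case", "det", "nmod"],
        pos=["DET", "NOUN", "ADP", "DET", "NOUN"],
    )
    # Restored iterator: one chunk spanning the head's whole subtree.
    print([(c.start, c.end, c.text) for c in doc.noun_chunks])
    # -> [(0, 5, "la destruction de la ville")]
    # The removed iterator yielded [(0, 2), (3, 5)] for the same parse.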

spacy/lang/it/__init__.py

@@ -6,15 +6,13 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
 from ...language import Language, BaseDefaults
 from .lemmatizer import ItalianLemmatizer
-from .syntax_iterators import SYNTAX_ITERATORS


 class ItalianDefaults(BaseDefaults):
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
-    stop_words = STOP_WORDS
     prefixes = TOKENIZER_PREFIXES
     infixes = TOKENIZER_INFIXES
-    syntax_iterators = SYNTAX_ITERATORS
+    stop_words = STOP_WORDS


 class Italian(Language):
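With the registration removed here and the module deleted in the next file, Italian loses noun chunk support entirely. A sketch of the resulting behavior (assuming spaCy's usual NotImplementedError for languages without a noun_chunks syntax iterator):

    from spacy.lang.it import Italian
    from spacy.tokens import Doc

    nlp = Italian()
    doc = Doc(
        nlp.vocab,
        words=["un", "pollo"],
        heads=[1, 1],
        deps=["det", "ROOT"],
        pos=["DET", "NOUN"],
    )
    try:
        print(list(doc.noun_chunks))
    except NotImplementedError as err:
        # Raised because no noun_chunks iterator is registered for "it".
        print("noun_chunks unavailable for Italian:", err)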

spacy/lang/it/syntax_iterators.py (file deleted)

@@ -1,86 +0,0 @@
-from typing import Union, Iterator, Tuple
-
-from ...symbols import NOUN, PROPN, PRON
-from ...errors import Errors
-from ...tokens import Doc, Span
-
-
-def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
-    """
-    Detect base noun phrases from a dependency parse. Works on both Doc and Span.
-    """
-    labels = [
-        "nsubj",
-        "nsubj:pass",
-        "obj",
-        "obl",
-        "obl:agent",
-        "nmod",
-        "pcomp",
-        "appos",
-        "ROOT",
-    ]
-    post_modifiers = ["flat", "flat:name", "fixed", "compound"]
-    dets = ["det", "det:poss"]
-    doc = doclike.doc  # Ensure works on both Doc and Span.
-    if not doc.has_annotation("DEP"):
-        raise ValueError(Errors.E029)
-    np_deps = {doc.vocab.strings.add(label) for label in labels}
-    np_modifs = {doc.vocab.strings.add(modifier) for modifier in post_modifiers}
-    np_label = doc.vocab.strings.add("NP")
-    adj_label = doc.vocab.strings.add("amod")
-    det_labels = {doc.vocab.strings.add(det) for det in dets}
-    det_pos = doc.vocab.strings.add("DET")
-    adp_label = doc.vocab.strings.add("ADP")
-    conj = doc.vocab.strings.add("conj")
-    conj_pos = doc.vocab.strings.add("CCONJ")
-    prev_end = -1
-    for i, word in enumerate(doclike):
-        if word.pos not in (NOUN, PROPN, PRON):
-            continue
-        # Prevent nested chunks from being produced
-        if word.left_edge.i <= prev_end:
-            continue
-        if word.dep in np_deps:
-            right_childs = list(word.rights)
-            right_child = right_childs[0] if right_childs else None
-            if right_child:
-                if (
-                    right_child.dep == adj_label
-                ):  # allow chain of adjectives by expanding to right
-                    right_end = right_child.right_edge
-                elif (
-                    right_child.dep in det_labels and right_child.pos == det_pos
-                ):  # cut relative pronouns here
-                    right_end = right_child
-                elif right_child.dep in np_modifs:  # Check if we can expand to right
-                    right_end = word.right_edge
-                else:
-                    right_end = word
-            else:
-                right_end = word
-            prev_end = right_end.i
-            left_index = word.left_edge.i
-            left_index = (
-                left_index + 1 if word.left_edge.pos == adp_label else left_index
-            )
-            yield left_index, right_end.i + 1, np_label
-        elif word.dep == conj:
-            head = word.head
-            while head.dep == conj and head.head.i < head.i:
-                head = head.head
-            # If the head is an NP, and we're coordinated to it, we're an NP
-            if head.dep in np_deps:
-                prev_end = word.i
-                left_index = word.left_edge.i  # eliminate left attached conjunction
-                left_index = (
-                    left_index + 1 if word.left_edge.pos == conj_pos else left_index
-                )
-                yield left_index, word.i + 1, np_label
-
-
-SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}

spacy/lang/nb/stop_words.py

@@ -4,42 +4,46 @@ alle allerede alt and andre annen annet at av
 bak bare bedre beste blant ble bli blir blitt bris by både

-da dag de del dem den denne der dermed det dette disse du
+da dag de del dem den denne der dermed det dette disse drept du

 eller en enn er et ett etter

-fem fikk fire fjor flere folk for fortsatt fra fram
+fem fikk fire fjor flere folk for fortsatt fotball fra fram frankrike fredag

 funnet får fått før først første

 gang gi gikk gjennom gjorde gjort gjør gjøre god godt grunn går

-ha hadde ham han hans har hele helt henne hennes her hun
+ha hadde ham han hans har hele helt henne hennes her hun hva hvor hvordan
+hvorfor

 i ifølge igjen ikke ingen inn

 ja jeg

 kamp kampen kan kl klart kom komme kommer kontakt kort kroner kunne kveld
+kvinner

-la laget land landet langt leder ligger like litt løpet
+la laget land landet langt leder ligger like litt løpet lørdag

-man mange med meg mellom men mener mennesker mens mer mot mye mål måtte
+man mandag mange mannen mars med meg mellom men mener menn mennesker mens mer
+millioner minutter mot msci mye mål måtte

-ned neste noe noen nok ny nye når
+ned neste noe noen nok norge norsk norske ntb ny nye når

-og også om opp opplyser oss over
+og også om onsdag opp opplyser oslo oss over

-personer plass poeng på
+personer plass poeng politidistrikt politiet president prosent på

-runde rundt
+regjeringen runde rundt russland

-sa saken samme sammen samtidig satt se seg seks selv senere ser sett
+sa saken samme sammen samtidig satt se seg seks selv senere september ser sett

 siden sier sin sine siste sitt skal skriver skulle slik som sted stedet stor
-store står svært
+store står sverige svært søndag

-ta tatt tid tidligere til tilbake tillegg tok tror
+ta tatt tid tidligere til tilbake tillegg tirsdag to tok torsdag tre tror
+tyskland

-under ut uten utenfor
+under usa ut uten utenfor

 vant var ved veldig vi videre viktig vil ville viser vår være vært

spacy/lang/sl/stop_words.py

@@ -1,10 +1,13 @@
 # Source: https://github.com/stopwords-iso/stopwords-sl
-# Removed various words that are not normally considered stop words, such as months.
+# TODO: probably needs to be tidied up the list seems to have month names in
+# it, which shouldn't be considered stop words.

 STOP_WORDS = set(
     """
 a
 ali
+april
+avgust
 b
 bi
 bil
@@ -16,6 +19,7 @@ biti
 blizu
 bo
 bodo
+bojo
 bolj
 bom
 bomo
@@ -33,6 +37,16 @@ da
 daleč
 dan
 danes
+datum
+december
+deset
+deseta
+deseti
+deseto
+devet
+deveta
+deveti
+deveto
 do
 dober
 dobra
@@ -40,7 +54,16 @@ dobri
 dobro
 dokler
 dol
+dolg
+dolga
+dolgi
 dovolj
+drug
+druga
+drugi
+drugo
+dva
+dve
 e
 eden
 en
@@ -51,6 +74,7 @@ enkrat
 eno
 etc.
 f
+februar
 g
 g.
 ga
@@ -69,12 +93,16 @@ iv
 ix
 iz
 j
+januar
 jaz
 je
 ji
 jih
 jim
 jo
+julij
+junij
+jutri
 k
 kadarkoli
 kaj
@@ -95,23 +123,41 @@ kje
 kjer
 kjerkoli
 ko
+koder
 koderkoli
 koga
 komu
 kot
+kratek
+kratka
+kratke
+kratki
 l
+lahka
+lahke
+lahki
+lahko
 le
 lep
 lepa
 lepe
 lepi
 lepo
+leto
 m
+maj
+majhen
+majhna
+majhni
+malce
+malo
 manj
+marec
 me
 med
 medtem
 mene
+mesec
 mi
 midva
 midve
@@ -137,6 +183,7 @@ najmanj
 naju
 največ
 nam
+narobe
 nas
 nato
 nazaj
@@ -145,6 +192,7 @@ naša
 naše
 ne
 nedavno
+nedelja
 nek
 neka
 nekaj
@@ -188,6 +236,7 @@ njuna
 njuno
 no
 nocoj
+november
 npr.
 o
 ob
@@ -195,23 +244,51 @@ oba
 obe
 oboje
 od
+odprt
+odprta
+odprti
 okoli
+oktober
 on
 onadva
 one
 oni
 onidve
+osem
+osma
+osmi
+osmo
 oz.
 p
 pa
+pet
+peta
+petek
+peti
+peto
 po
 pod
 pogosto
 poleg
+poln
+polna
+polni
+polno
 ponavadi
+ponedeljek
 ponovno
 potem
 povsod
+pozdravljen
+pozdravljeni
+prav
+prava
+prave
+pravi
+pravo
+prazen
+prazna
+prazno
 prbl.
 precej
 pred
@@ -220,10 +297,19 @@ preko
 pri
 pribl.
 približno
+primer
+pripravljen
+pripravljena
+pripravljeni
 proti
+prva
+prvi
+prvo
 r
+ravno
 redko
 res
+reč
 s
 saj
 sam
@@ -235,17 +321,29 @@ se
 sebe
 sebi
 sedaj
+sedem
+sedma
+sedmi
+sedmo
 sem
+september
 seveda
 si
 sicer
 skoraj
 skozi
+slab
 smo
 so
+sobota
 spet
+sreda
+srednja
+srednji
 sta
 ste
+stran
+stvar
 sva
 t
 ta
@@ -260,6 +358,10 @@ te
 tebe
 tebi
 tega
+težak
+težka
+težki
+težko
 ti
 tista
 tiste
@@ -269,6 +371,11 @@ tj.
 tja
 to
 toda
+torek
+tretja
+tretje
+tretji
+tri
 tu
 tudi
 tukaj
@@ -285,6 +392,10 @@ vaša
 vaše
 ve
 vedno
+velik
+velika
+veliki
+veliko
 vendar
 ves
 več
@@ -292,6 +403,10 @@ vi
 vidva
 vii
 viii
+visok
+visoka
+visoke
+visoki
 vsa
 vsaj
 vsak
@@ -305,21 +420,34 @@ vsega
 vsi
 vso
 včasih
+včeraj
 x
 z
 za
 zadaj
 zadnji
 zakaj
+zaprta
+zaprti
+zaprto
 zdaj
 zelo
 zunaj
 č
 če
 često
+četrta
+četrtek
+četrti
+četrto
 čez
 čigav
 š
+šest
+šesta
+šesti
+šesto
+štiri
 ž
 že
 """.split()

spacy/tests/conftest.py

@@ -155,11 +155,6 @@ def fr_tokenizer():
     return get_lang_class("fr")().tokenizer


-@pytest.fixture(scope="session")
-def fr_vocab():
-    return get_lang_class("fr")().vocab
-
-
 @pytest.fixture(scope="session")
 def ga_tokenizer():
     return get_lang_class("ga")().tokenizer
@@ -210,11 +205,6 @@ def it_tokenizer():
     return get_lang_class("it")().tokenizer


-@pytest.fixture(scope="session")
-def it_vocab():
-    return get_lang_class("it")().vocab
-
-
 @pytest.fixture(scope="session")
 def ja_tokenizer():
     pytest.importorskip("sudachipy")

spacy/tests/lang/fr/test_noun_chunks.py

@@ -1,230 +1,8 @@
-from spacy.tokens import Doc
 import pytest


-# fmt: off
-@pytest.mark.parametrize(
-    "words,heads,deps,pos,chunk_offsets",
-    [
-        # determiner + noun
-        # un nom -> un nom
-        (
-            ["un", "nom"],
-            [1, 1],
-            ["det", "ROOT"],
-            ["DET", "NOUN"],
-            [(0, 2)],
-        ),
-        # determiner + noun starting with vowel
-        # l'heure -> l'heure
-        (
-            ["l'", "heure"],
-            [1, 1],
-            ["det", "ROOT"],
-            ["DET", "NOUN"],
-            [(0, 2)],
-        ),
-        # determiner + plural noun
-        # les romans -> les romans
-        (
-            ["les", "romans"],
-            [1, 1],
-            ["det", "ROOT"],
-            ["DET", "NOUN"],
-            [(0, 2)],
-        ),
-        # det + adj + noun
-        # Le vieux Londres -> Le vieux Londres
-        (
-            ['Les', 'vieux', 'Londres'],
-            [2, 2, 2],
-            ["det", "amod", "ROOT"],
-            ["DET", "ADJ", "NOUN"],
-            [(0,3)]
-        ),
-        # det + noun + adj
-        # le nom propre -> le nom propre   a proper noun
-        (
-            ["le", "nom", "propre"],
-            [1, 1, 1],
-            ["det", "ROOT", "amod"],
-            ["DET", "NOUN", "ADJ"],
-            [(0, 3)],
-        ),
-        # det + noun + adj plural
-        # Les chiens bruns -> les chiens bruns
-        (
-            ["Les", "chiens", "bruns"],
-            [1, 1, 1],
-            ["det", "ROOT", "amod"],
-            ["DET", "NOUN", "ADJ"],
-            [(0, 3)],
-        ),
-        # multiple adjectives: one adj before the noun, one adj after the noun
-        # un nouveau film intéressant -> un nouveau film intéressant
-        (
-            ["un", "nouveau", "film", "intéressant"],
-            [2, 2, 2, 2],
-            ["det", "amod", "ROOT", "amod"],
-            ["DET", "ADJ", "NOUN", "ADJ"],
-            [(0,4)]
-        ),
-        # multiple adjectives, both adjs after the noun
-        # une personne intelligente et drôle -> une personne intelligente et drôle
-        (
-            ["une", "personne", "intelligente", "et", "drôle"],
-            [1, 1, 1, 4, 2],
-            ["det", "ROOT", "amod", "cc", "conj"],
-            ["DET", "NOUN", "ADJ", "CCONJ", "ADJ"],
-            [(0,5)]
-        ),
-        # relative pronoun
-        # un bus qui va au ville -> un bus, qui, ville
-        (
-            ['un', 'bus', 'qui', 'va', 'au', 'ville'],
-            [1, 1, 3, 1, 5, 3],
-            ['det', 'ROOT', 'nsubj', 'acl:relcl', 'case', 'obl:arg'],
-            ['DET', 'NOUN', 'PRON', 'VERB', 'ADP', 'NOUN'],
-            [(0,2), (2,3), (5,6)]
-        ),
-        # relative subclause
-        # Voilà la maison que nous voulons acheter -> la maison, nous   That's the house that we want to buy.
-        (
-            ['Voilà', 'la', 'maison', 'que', 'nous', 'voulons', 'acheter'],
-            [0, 2, 0, 5, 5, 2, 5],
-            ['ROOT', 'det', 'obj', 'mark', 'nsubj', 'acl:relcl', 'xcomp'],
-            ['VERB', 'DET', 'NOUN', 'SCONJ', 'PRON', 'VERB', 'VERB'],
-            [(1,3), (4,5)]
-        ),
-        # Person name and title by flat
-        # Louis XIV -> Louis XIV
-        (
-            ["Louis", "XIV"],
-            [0, 0],
-            ["ROOT", "flat:name"],
-            ["PROPN", "PROPN"],
-            [(0,2)]
-        ),
-        # Organization name by flat
-        # Nations Unies -> Nations Unies
-        (
-            ["Nations", "Unies"],
-            [0, 0],
-            ["ROOT", "flat:name"],
-            ["PROPN", "PROPN"],
-            [(0,2)]
-        ),
-        # Noun compound, person name created by two flats
-        # Louise de Bratagne -> Louise de Bratagne
-        (
-            ["Louise", "de", "Bratagne"],
-            [0, 0, 0],
-            ["ROOT", "flat:name", "flat:name"],
-            ["PROPN", "PROPN", "PROPN"],
-            [(0,3)]
-        ),
-        # Noun compound, person name created by two flats
-        # Louis François Joseph -> Louis François Joseph
-        (
-            ["Louis", "François", "Joseph"],
-            [0, 0, 0],
-            ["ROOT", "flat:name", "flat:name"],
-            ["PROPN", "PROPN", "PROPN"],
-            [(0,3)]
-        ),
-        # one determiner + one noun + one adjective qualified by an adverb
-        # quelques agriculteurs très riches -> quelques agriculteurs très riches
-        (
-            ["quelques", "agriculteurs", "très", "riches"],
-            [1, 1, 3, 1],
-            ['det', 'ROOT', 'advmod', 'amod'],
-            ['DET', 'NOUN', 'ADV', 'ADJ'],
-            [(0,4)]
-        ),
-        # Two NPs conjuncted
-        # Il a un chien et un chat -> Il, un chien, un chat
-        (
-            ['Il', 'a', 'un', 'chien', 'et', 'un', 'chat'],
-            [1, 1, 3, 1, 6, 6, 3],
-            ['nsubj', 'ROOT', 'det', 'obj', 'cc', 'det', 'conj'],
-            ['PRON', 'VERB', 'DET', 'NOUN', 'CCONJ', 'DET', 'NOUN'],
-            [(0,1), (2,4), (5,7)]
-        ),
-        # Two NPs together
-        # l'écrivain brésilien Aníbal Machado -> l'écrivain brésilien, Aníbal Machado
-        (
-            ["l'", 'écrivain', 'brésilien', 'Aníbal', 'Machado'],
-            [1, 1, 1, 1, 3],
-            ['det', 'ROOT', 'amod', 'appos', 'flat:name'],
-            ['DET', 'NOUN', 'ADJ', 'PROPN', 'PROPN'],
-            [(0, 3), (3, 5)]
-        ),
-        # nmod relation between NPs
-        # la destruction de la ville -> la destruction, la ville
-        (
-            ['la', 'destruction', 'de', 'la', 'ville'],
-            [1, 1, 4, 4, 1],
-            ['det', 'ROOT', 'case', 'det', 'nmod'],
-            ['DET', 'NOUN', 'ADP', 'DET', 'NOUN'],
-            [(0,2), (3,5)]
-        ),
-        # nmod relation between NPs
-        # Archiduchesse d'Autriche -> Archiduchesse, Autriche
-        (
-            ['Archiduchesse', 'd', 'Autriche'],
-            [0, 2, 0],
-            ['ROOT', 'case', 'nmod'],
-            ['NOUN', 'ADP', 'PROPN'],
-            [(0,1), (2,3)]
-        ),
-        # Compounding by nmod, several NPs chained together
-        # la première usine de drogue du gouvernement -> la première usine, drogue, gouvernement
-        (
-            ["la", "première", "usine", "de", "drogue", "du", "gouvernement"],
-            [2, 2, 2, 4, 2, 6, 2],
-            ['det', 'amod', 'ROOT', 'case', 'nmod', 'case', 'nmod'],
-            ['DET', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN'],
-            [(0, 3), (4, 5), (6, 7)]
-        ),
-        # several NPs
-        # Traduction du rapport de Susana -> Traduction, rapport, Susana
-        (
-            ['Traduction', 'du', 'raport', 'de', 'Susana'],
-            [0, 2, 0, 4, 2],
-            ['ROOT', 'case', 'nmod', 'case', 'nmod'],
-            ['NOUN', 'ADP', 'NOUN', 'ADP', 'PROPN'],
-            [(0,1), (2,3), (4,5)]
-        ),
-        # Several NPs
-        # Le gros chat de Susana et son amie -> Le gros chat, Susana, son amie
-        (
-            ['Le', 'gros', 'chat', 'de', 'Susana', 'et', 'son', 'amie'],
-            [2, 2, 2, 4, 2, 7, 7, 2],
-            ['det', 'amod', 'ROOT', 'case', 'nmod', 'cc', 'det', 'conj'],
-            ['DET', 'ADJ', 'NOUN', 'ADP', 'PROPN', 'CCONJ', 'DET', 'NOUN'],
-            [(0,3), (4,5), (6,8)]
-        ),
-        # Passive subject
-        # Les nouvelles dépenses sont alimentées par le grand compte bancaire de Clinton -> Les nouvelles dépenses, le grand compte bancaire, Clinton
-        (
-            ['Les', 'nouvelles', 'dépenses', 'sont', 'alimentées', 'par', 'le', 'grand', 'compte', 'bancaire', 'de', 'Clinton'],
-            [2, 2, 4, 4, 4, 8, 8, 8, 4, 8, 11, 8],
-            ['det', 'amod', 'nsubj:pass', 'aux:pass', 'ROOT', 'case', 'det', 'amod', 'obl:agent', 'amod', 'case', 'nmod'],
-            ['DET', 'ADJ', 'NOUN', 'AUX', 'VERB', 'ADP', 'DET', 'ADJ', 'NOUN', 'ADJ', 'ADP', 'PROPN'],
-            [(0, 3), (6, 10), (11, 12)]
-        )
-    ],
-)
-# fmt: on
-def test_fr_noun_chunks(fr_vocab, words, heads, deps, pos, chunk_offsets):
-    doc = Doc(fr_vocab, words=words, heads=heads, deps=deps, pos=pos)
-    assert [(c.start, c.end) for c in doc.noun_chunks] == chunk_offsets
-
-
 def test_noun_chunks_is_parsed_fr(fr_tokenizer):
     """Test that noun_chunks raises Value Error for 'fr' language if Doc is not parsed."""
-    doc = fr_tokenizer("Je suis allé à l'école")
+    doc = fr_tokenizer("trouver des travaux antérieurs")
     with pytest.raises(ValueError):
         list(doc.noun_chunks)

spacy/tests/lang/it/test_noun_chunks.py (file deleted)

@@ -1,221 +0,0 @@
-from spacy.tokens import Doc
-import pytest
-
-
-# fmt: off
-@pytest.mark.parametrize(
-    "words,heads,deps,pos,chunk_offsets",
-    [
-        # determiner + noun
-        # un pollo -> un pollo
-        (
-            ["un", "pollo"],
-            [1, 1],
-            ["det", "ROOT"],
-            ["DET", "NOUN"],
-            [(0,2)],
-        ),
-        # two determiners + noun
-        # il mio cane -> il mio cane
-        (
-            ["il", "mio", "cane"],
-            [2, 2, 2],
-            ["det", "det:poss", "ROOT"],
-            ["DET", "DET", "NOUN"],
-            [(0,3)],
-        ),
-        # two determiners, one is after noun. rare usage but still testing
-        # il cane mio -> il cane mio
-        (
-            ["il", "cane", "mio"],
-            [1, 1, 1],
-            ["det", "ROOT", "det:poss"],
-            ["DET", "NOUN", "DET"],
-            [(0,3)],
-        ),
-        # relative pronoun
-        # È molto bello il vestito che hai acquistato -> il vestito, che   the dress that you bought is very pretty.
-        (
-            ["È", "molto", "bello", "il", "vestito", "che", "hai", "acquistato"],
-            [2, 2, 2, 4, 2, 7, 7, 4],
-            ['cop', 'advmod', 'ROOT', 'det', 'nsubj', 'obj', 'aux', 'acl:relcl'],
-            ['AUX', 'ADV', 'ADJ', 'DET', 'NOUN', 'PRON', 'AUX', 'VERB'],
-            [(3,5), (5,6)]
-        ),
-        # relative subclause
-        # il computer che hai comprato -> il computer, che   the computer that you bought
-        (
-            ['il', 'computer', 'che', 'hai', 'comprato'],
-            [1, 1, 4, 4, 1],
-            ['det', 'ROOT', 'nsubj', 'aux', 'acl:relcl'],
-            ['DET', 'NOUN', 'PRON', 'AUX', 'VERB'],
-            [(0,2), (2,3)]
-        ),
-        # det + noun + adj
-        # Una macchina grande -> Una macchina grande
-        (
-            ["Una", "macchina", "grande"],
-            [1, 1, 1],
-            ["det", "ROOT", "amod"],
-            ["DET", "NOUN", "ADJ"],
-            [(0,3)],
-        ),
-        # noun + adj plural
-        # mucche bianche
-        (
-            ["mucche", "bianche"],
-            [0, 0],
-            ["ROOT", "amod"],
-            ["NOUN", "ADJ"],
-            [(0,2)],
-        ),
-        # det + adj + noun
-        # Una grande macchina -> Una grande macchina
-        (
-            ['Una', 'grande', 'macchina'],
-            [2, 2, 2],
-            ["det", "amod", "ROOT"],
-            ["DET", "ADJ", "NOUN"],
-            [(0,3)]
-        ),
-        # det + adj + noun, det with apostrophe
-        # un'importante associazione -> un'importante associazione
-        (
-            ["Un'", 'importante', 'associazione'],
-            [2, 2, 2],
-            ["det", "amod", "ROOT"],
-            ["DET", "ADJ", "NOUN"],
-            [(0,3)]
-        ),
-        # multiple adjectives
-        # Un cane piccolo e marrone -> Un cane piccolo e marrone
-        (
-            ["Un", "cane", "piccolo", "e", "marrone"],
-            [1, 1, 1, 4, 2],
-            ["det", "ROOT", "amod", "cc", "conj"],
-            ["DET", "NOUN", "ADJ", "CCONJ", "ADJ"],
-            [(0,5)]
-        ),
-        # determiner, adjective, compound created by flat
-        # le Nazioni Unite -> le Nazioni Unite
-        (
-            ["le", "Nazioni", "Unite"],
-            [1, 1, 1],
-            ["det", "ROOT", "flat:name"],
-            ["DET", "PROPN", "PROPN"],
-            [(0,3)]
-        ),
-        # one determiner + one noun + one adjective qualified by an adverb
-        # alcuni contadini molto ricchi -> alcuni contadini molto ricchi   some very rich farmers
-        (
-            ['alcuni', 'contadini', 'molto', 'ricchi'],
-            [1, 1, 3, 1],
-            ['det', 'ROOT', 'advmod', 'amod'],
-            ['DET', 'NOUN', 'ADV', 'ADJ'],
-            [(0,4)]
-        ),
-        # Two NPs conjuncted
-        # Ho un cane e un gatto -> un cane, un gatto
-        (
-            ['Ho', 'un', 'cane', 'e', 'un', 'gatto'],
-            [0, 2, 0, 5, 5, 0],
-            ['ROOT', 'det', 'obj', 'cc', 'det', 'conj'],
-            ['VERB', 'DET', 'NOUN', 'CCONJ', 'DET', 'NOUN'],
-            [(1,3), (4,6)]
-        ),
-        # Two NPs together
-        # lo scrittore brasiliano Aníbal Machado -> lo scrittore brasiliano, Aníbal Machado
-        (
-            ['lo', 'scrittore', 'brasiliano', 'Aníbal', 'Machado'],
-            [1, 1, 1, 1, 3],
-            ['det', 'ROOT', 'amod', 'nmod', 'flat:name'],
-            ['DET', 'NOUN', 'ADJ', 'PROPN', 'PROPN'],
-            [(0, 3), (3, 5)]
-        ),
-        # Noun compound, person name and titles
-        # Dom Pedro II -> Dom Pedro II
-        (
-            ["Dom", "Pedro", "II"],
-            [0, 0, 0],
-            ["ROOT", "flat:name", "flat:name"],
-            ["PROPN", "PROPN", "PROPN"],
-            [(0,3)]
-        ),
-        # Noun compound created by flat
-        # gli Stati Uniti
-        (
-            ["gli", "Stati", "Uniti"],
-            [1, 1, 1],
-            ["det", "ROOT", "flat:name"],
-            ["DET", "PROPN", "PROPN"],
-            [(0,3)]
-        ),
-        # nmod relation between NPs
-        # la distruzione della città -> la distruzione, città
-        (
-            ['la', 'distruzione', 'della', 'città'],
-            [1, 1, 3, 1],
-            ['det', 'ROOT', 'case', 'nmod'],
-            ['DET', 'NOUN', 'ADP', 'NOUN'],
-            [(0,2), (3,4)]
-        ),
-        # Compounding by nmod, several NPs chained together
-        # la prima fabbrica di droga del governo -> la prima fabbrica, droga, governo
-        (
-            ["la", "prima", "fabbrica", "di", "droga", "del", "governo"],
-            [2, 2, 2, 4, 2, 6, 2],
-            ['det', 'amod', 'ROOT', 'case', 'nmod', 'case', 'nmod'],
-            ['DET', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN'],
-            [(0, 3), (4, 5), (6, 7)]
-        ),
-        # several NPs
-        # Traduzione del rapporto di Susana -> Traduzione, rapporto, Susana
-        (
-            ['Traduzione', 'del', 'rapporto', 'di', 'Susana'],
-            [0, 2, 0, 4, 2],
-            ['ROOT', 'case', 'nmod', 'case', 'nmod'],
-            ['NOUN', 'ADP', 'NOUN', 'ADP', 'PROPN'],
-            [(0,1), (2,3), (4,5)]
-        ),
-        # Several NPs
-        # Il gatto grasso di Susana e la sua amica -> Il gatto grasso, Susana, sua amica
-        (
-            ['Il', 'gatto', 'grasso', 'di', 'Susana', 'e', 'la', 'sua', 'amica'],
-            [1, 1, 1, 4, 1, 8, 8, 8, 1],
-            ['det', 'ROOT', 'amod', 'case', 'nmod', 'cc', 'det', 'det:poss', 'conj'],
-            ['DET', 'NOUN', 'ADJ', 'ADP', 'PROPN', 'CCONJ', 'DET', 'DET', 'NOUN'],
-            [(0,3), (4,5), (6,9)]
-        ),
-        # Passive subject
-        # La nuova spesa è alimentata dal grande conto in banca di Clinton -> La nuova spesa, grande conto, banca, Clinton
-        (
-            ['La', 'nuova', 'spesa', 'è', 'alimentata', 'dal', 'grande', 'conto', 'in', 'banca', 'di', 'Clinton'],
-            [2, 2, 4, 4, 4, 7, 7, 4, 9, 7, 11, 9],
-            ['det', 'amod', 'nsubj:pass', 'aux:pass', 'ROOT', 'case', 'amod', 'obl:agent', 'case', 'nmod', 'case', 'nmod'],
-            ['DET', 'ADJ', 'NOUN', 'AUX', 'VERB', 'ADP', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'ADP', 'PROPN'],
-            [(0, 3), (6, 8), (9, 10), (11,12)]
-        ),
-        # Misc
-        # Ma mentre questo prestito possa ora sembrare gestibile, un improvviso cambiamento delle circostanze potrebbe portare a problemi di debiti -> questo prestito, un improvviso cambiamento, circostanze, problemi, debiti
-        (
-            ['Ma', 'mentre', 'questo', 'prestito', 'possa', 'ora', 'sembrare', 'gestibile', ',', 'un', 'improvviso', 'cambiamento', 'delle', 'circostanze', 'potrebbe', 'portare', 'a', 'problemi', 'di', 'debitii'],
-            [15, 6, 3, 6, 6, 6, 15, 6, 6, 11, 11, 15, 13, 11, 15, 15, 17, 15, 19, 17],
-            ['cc', 'mark', 'det', 'nsubj', 'aux', 'advmod', 'advcl', 'xcomp', 'punct', 'det', 'amod', 'nsubj', 'case', 'nmod', 'aux', 'ROOT', 'case', 'obl', 'case', 'nmod'],
-            ['CCONJ', 'SCONJ', 'DET', 'NOUN', 'AUX', 'ADV', 'VERB', 'ADJ', 'PUNCT', 'DET', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'AUX', 'VERB', 'ADP', 'NOUN', 'ADP', 'NOUN'],
-            [(2,4), (9,12), (13,14), (17,18), (19,20)]
-        )
-    ],
-)
-# fmt: on
-def test_it_noun_chunks(it_vocab, words, heads, deps, pos, chunk_offsets):
-    doc = Doc(it_vocab, words=words, heads=heads, deps=deps, pos=pos)
-    assert [(c.start, c.end) for c in doc.noun_chunks] == chunk_offsets
-
-
-def test_noun_chunks_is_parsed_it(it_tokenizer):
-    """Test that noun_chunks raises Value Error for 'it' language if Doc is not parsed."""
-    doc = it_tokenizer("Sei andato a Oxford")
-    with pytest.raises(ValueError):
-        list(doc.noun_chunks)