Revert "Bump sudachipy version (#9917)" (#10071)

This reverts commit 58bdd8607b.
This commit is contained in:
Adriane Boyd 2022-01-17 10:38:37 +01:00 committed by GitHub
parent 6a8619dd73
commit add52935ff
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 162 additions and 624 deletions

View File

@ -108,8 +108,8 @@ apple =
thinc-apple-ops>=0.0.4,<1.0.0
# Language tokenizers with external dependencies
ja =
sudachipy>=0.5.2,!=0.6.1
sudachidict_core>=20211220
sudachipy>=0.4.9
sudachidict_core>=20200330
ko =
natto-py==0.9.0
th =

View File

@ -45,10 +45,6 @@ _hangul_syllables = r"\uAC00-\uD7AF"
_hangul_jamo = r"\u1100-\u11FF"
_hangul = _hangul_syllables + _hangul_jamo
_hiragana = r"\u3040-\u309F"
_katakana = r"\u30A0-\u30FFー"
_kana = _hiragana + _katakana
# letters with diacritics - Catalan, Czech, Latin, Latvian, Lithuanian, Polish, Slovak, Turkish, Welsh
_latin_u_extendedA = (
r"\u0100\u0102\u0104\u0106\u0108\u010A\u010C\u010E\u0110\u0112\u0114\u0116\u0118\u011A\u011C"
@ -248,7 +244,6 @@ _uncased = (
+ _tamil
+ _telugu
+ _hangul
+ _kana
+ _cjk
)

View File

@ -6,35 +6,16 @@ from ...tokens import Doc, Span
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
"""
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
"""
labels = [
"nsubj",
"nsubj:pass",
"obj",
"obl",
"obl:agent",
"obl:arg",
"obl:mod",
"nmod",
"pcomp",
"appos",
"ROOT",
]
post_modifiers = ["flat", "flat:name", "flat:foreign", "fixed", "compound"]
"""Detect base noun phrases from a dependency parse. Works on Doc and Span."""
# fmt: off
labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
# fmt: on
doc = doclike.doc # Ensure works on both Doc and Span.
if not doc.has_annotation("DEP"):
raise ValueError(Errors.E029)
np_deps = {doc.vocab.strings.add(label) for label in labels}
np_modifs = {doc.vocab.strings.add(modifier) for modifier in post_modifiers}
np_deps = [doc.vocab.strings[label] for label in labels]
conj = doc.vocab.strings.add("conj")
np_label = doc.vocab.strings.add("NP")
adj_label = doc.vocab.strings.add("amod")
det_label = doc.vocab.strings.add("det")
det_pos = doc.vocab.strings.add("DET")
adp_pos = doc.vocab.strings.add("ADP")
conj_label = doc.vocab.strings.add("conj")
conj_pos = doc.vocab.strings.add("CCONJ")
prev_end = -1
for i, word in enumerate(doclike):
if word.pos not in (NOUN, PROPN, PRON):
@ -43,45 +24,16 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
if word.left_edge.i <= prev_end:
continue
if word.dep in np_deps:
right_childs = list(word.rights)
right_child = right_childs[0] if right_childs else None
if right_child:
if (
right_child.dep == adj_label
): # allow chain of adjectives by expanding to right
right_end = right_child.right_edge
elif (
right_child.dep == det_label and right_child.pos == det_pos
): # cut relative pronouns here
right_end = right_child
elif right_child.dep in np_modifs: # Check if we can expand to right
right_end = word.right_edge
else:
right_end = word
else:
right_end = word
prev_end = right_end.i
left_index = word.left_edge.i
left_index = (
left_index + 1 if word.left_edge.pos == adp_pos else left_index
)
yield left_index, right_end.i + 1, np_label
elif word.dep == conj_label:
prev_end = word.right_edge.i
yield word.left_edge.i, word.right_edge.i + 1, np_label
elif word.dep == conj:
head = word.head
while head.dep == conj_label and head.head.i < head.i:
while head.dep == conj and head.head.i < head.i:
head = head.head
# If the head is an NP, and we're coordinated to it, we're an NP
if head.dep in np_deps:
prev_end = word.i
left_index = word.left_edge.i # eliminate left attached conjunction
left_index = (
left_index + 1 if word.left_edge.pos == conj_pos else left_index
)
yield left_index, word.i + 1, np_label
prev_end = word.right_edge.i
yield word.left_edge.i, word.right_edge.i + 1, np_label
SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}

View File

@ -6,15 +6,13 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
from ...language import Language, BaseDefaults
from .lemmatizer import ItalianLemmatizer
from .syntax_iterators import SYNTAX_ITERATORS
class ItalianDefaults(BaseDefaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
stop_words = STOP_WORDS
prefixes = TOKENIZER_PREFIXES
infixes = TOKENIZER_INFIXES
stop_words = STOP_WORDS
syntax_iterators = SYNTAX_ITERATORS
class Italian(Language):

View File

@ -1,86 +0,0 @@
from typing import Union, Iterator, Tuple
from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors
from ...tokens import Doc, Span
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
"""
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
"""
labels = [
"nsubj",
"nsubj:pass",
"obj",
"obl",
"obl:agent",
"nmod",
"pcomp",
"appos",
"ROOT",
]
post_modifiers = ["flat", "flat:name", "fixed", "compound"]
dets = ["det", "det:poss"]
doc = doclike.doc # Ensure works on both Doc and Span.
if not doc.has_annotation("DEP"):
raise ValueError(Errors.E029)
np_deps = {doc.vocab.strings.add(label) for label in labels}
np_modifs = {doc.vocab.strings.add(modifier) for modifier in post_modifiers}
np_label = doc.vocab.strings.add("NP")
adj_label = doc.vocab.strings.add("amod")
det_labels = {doc.vocab.strings.add(det) for det in dets}
det_pos = doc.vocab.strings.add("DET")
adp_label = doc.vocab.strings.add("ADP")
conj = doc.vocab.strings.add("conj")
conj_pos = doc.vocab.strings.add("CCONJ")
prev_end = -1
for i, word in enumerate(doclike):
if word.pos not in (NOUN, PROPN, PRON):
continue
# Prevent nested chunks from being produced
if word.left_edge.i <= prev_end:
continue
if word.dep in np_deps:
right_childs = list(word.rights)
right_child = right_childs[0] if right_childs else None
if right_child:
if (
right_child.dep == adj_label
): # allow chain of adjectives by expanding to right
right_end = right_child.right_edge
elif (
right_child.dep in det_labels and right_child.pos == det_pos
): # cut relative pronouns here
right_end = right_child
elif right_child.dep in np_modifs: # Check if we can expand to right
right_end = word.right_edge
else:
right_end = word
else:
right_end = word
prev_end = right_end.i
left_index = word.left_edge.i
left_index = (
left_index + 1 if word.left_edge.pos == adp_label else left_index
)
yield left_index, right_end.i + 1, np_label
elif word.dep == conj:
head = word.head
while head.dep == conj and head.head.i < head.i:
head = head.head
# If the head is an NP, and we're coordinated to it, we're an NP
if head.dep in np_deps:
prev_end = word.i
left_index = word.left_edge.i # eliminate left attached conjunction
left_index = (
left_index + 1 if word.left_edge.pos == conj_pos else left_index
)
yield left_index, word.i + 1, np_label
SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}

View File

@ -4,42 +4,46 @@ alle allerede alt and andre annen annet at av
bak bare bedre beste blant ble bli blir blitt bris by både
da dag de del dem den denne der dermed det dette disse du
da dag de del dem den denne der dermed det dette disse drept du
eller en enn er et ett etter
fem fikk fire fjor flere folk for fortsatt fra fram
fem fikk fire fjor flere folk for fortsatt fotball fra fram frankrike fredag
funnet får fått før først første
gang gi gikk gjennom gjorde gjort gjør gjøre god godt grunn går
ha hadde ham han hans har hele helt henne hennes her hun
ha hadde ham han hans har hele helt henne hennes her hun hva hvor hvordan
hvorfor
i ifølge igjen ikke ingen inn
ja jeg
kamp kampen kan kl klart kom komme kommer kontakt kort kroner kunne kveld
kvinner
la laget land landet langt leder ligger like litt løpet
la laget land landet langt leder ligger like litt løpet lørdag
man mange med meg mellom men mener mennesker mens mer mot mye mål måtte
man mandag mange mannen mars med meg mellom men mener menn mennesker mens mer
millioner minutter mot msci mye mål måtte
ned neste noe noen nok ny nye når
ned neste noe noen nok norge norsk norske ntb ny nye når
og også om opp opplyser oss over
og også om onsdag opp opplyser oslo oss over
personer plass poeng på
personer plass poeng politidistrikt politiet president prosent på
runde rundt
regjeringen runde rundt russland
sa saken samme sammen samtidig satt se seg seks selv senere ser sett
sa saken samme sammen samtidig satt se seg seks selv senere september ser sett
siden sier sin sine siste sitt skal skriver skulle slik som sted stedet stor
store står svært
store står sverige svært søndag
ta tatt tid tidligere til tilbake tillegg tok tror
ta tatt tid tidligere til tilbake tillegg tirsdag to tok torsdag tre tror
tyskland
under ut uten utenfor
under usa ut uten utenfor
vant var ved veldig vi videre viktig vil ville viser vår være vært

View File

@ -1,10 +1,13 @@
# Source: https://github.com/stopwords-iso/stopwords-sl
# Removed various words that are not normally considered stop words, such as months.
# TODO: probably needs to be tidied up the list seems to have month names in
# it, which shouldn't be considered stop words.
STOP_WORDS = set(
"""
a
ali
april
avgust
b
bi
bil
@ -16,6 +19,7 @@ biti
blizu
bo
bodo
bojo
bolj
bom
bomo
@ -33,6 +37,16 @@ da
daleč
dan
danes
datum
december
deset
deseta
deseti
deseto
devet
deveta
deveti
deveto
do
dober
dobra
@ -40,7 +54,16 @@ dobri
dobro
dokler
dol
dolg
dolga
dolgi
dovolj
drug
druga
drugi
drugo
dva
dve
e
eden
en
@ -51,6 +74,7 @@ enkrat
eno
etc.
f
februar
g
g.
ga
@ -69,12 +93,16 @@ iv
ix
iz
j
januar
jaz
je
ji
jih
jim
jo
julij
junij
jutri
k
kadarkoli
kaj
@ -95,23 +123,41 @@ kje
kjer
kjerkoli
ko
koder
koderkoli
koga
komu
kot
kratek
kratka
kratke
kratki
l
lahka
lahke
lahki
lahko
le
lep
lepa
lepe
lepi
lepo
leto
m
maj
majhen
majhna
majhni
malce
malo
manj
marec
me
med
medtem
mene
mesec
mi
midva
midve
@ -137,6 +183,7 @@ najmanj
naju
največ
nam
narobe
nas
nato
nazaj
@ -145,6 +192,7 @@ naša
naše
ne
nedavno
nedelja
nek
neka
nekaj
@ -188,6 +236,7 @@ njuna
njuno
no
nocoj
november
npr.
o
ob
@ -195,23 +244,51 @@ oba
obe
oboje
od
odprt
odprta
odprti
okoli
oktober
on
onadva
one
oni
onidve
osem
osma
osmi
osmo
oz.
p
pa
pet
peta
petek
peti
peto
po
pod
pogosto
poleg
poln
polna
polni
polno
ponavadi
ponedeljek
ponovno
potem
povsod
pozdravljen
pozdravljeni
prav
prava
prave
pravi
pravo
prazen
prazna
prazno
prbl.
precej
pred
@ -220,10 +297,19 @@ preko
pri
pribl.
približno
primer
pripravljen
pripravljena
pripravljeni
proti
prva
prvi
prvo
r
ravno
redko
res
reč
s
saj
sam
@ -235,17 +321,29 @@ se
sebe
sebi
sedaj
sedem
sedma
sedmi
sedmo
sem
september
seveda
si
sicer
skoraj
skozi
slab
smo
so
sobota
spet
sreda
srednja
srednji
sta
ste
stran
stvar
sva
t
ta
@ -260,6 +358,10 @@ te
tebe
tebi
tega
težak
težka
težki
težko
ti
tista
tiste
@ -269,6 +371,11 @@ tj.
tja
to
toda
torek
tretja
tretje
tretji
tri
tu
tudi
tukaj
@ -285,6 +392,10 @@ vaša
vaše
ve
vedno
velik
velika
veliki
veliko
vendar
ves
več
@ -292,6 +403,10 @@ vi
vidva
vii
viii
visok
visoka
visoke
visoki
vsa
vsaj
vsak
@ -305,21 +420,34 @@ vsega
vsi
vso
včasih
včeraj
x
z
za
zadaj
zadnji
zakaj
zaprta
zaprti
zaprto
zdaj
zelo
zunaj
č
če
često
četrta
četrtek
četrti
četrto
čez
čigav
š
šest
šesta
šesti
šesto
štiri
ž
že
""".split()

View File

@ -155,11 +155,6 @@ def fr_tokenizer():
return get_lang_class("fr")().tokenizer
@pytest.fixture(scope="session")
def fr_vocab():
return get_lang_class("fr")().vocab
@pytest.fixture(scope="session")
def ga_tokenizer():
return get_lang_class("ga")().tokenizer
@ -210,11 +205,6 @@ def it_tokenizer():
return get_lang_class("it")().tokenizer
@pytest.fixture(scope="session")
def it_vocab():
return get_lang_class("it")().vocab
@pytest.fixture(scope="session")
def ja_tokenizer():
pytest.importorskip("sudachipy")

View File

@ -1,230 +1,8 @@
from spacy.tokens import Doc
import pytest
# fmt: off
@pytest.mark.parametrize(
"words,heads,deps,pos,chunk_offsets",
[
# determiner + noun
# un nom -> un nom
(
["un", "nom"],
[1, 1],
["det", "ROOT"],
["DET", "NOUN"],
[(0, 2)],
),
# determiner + noun starting with vowel
# l'heure -> l'heure
(
["l'", "heure"],
[1, 1],
["det", "ROOT"],
["DET", "NOUN"],
[(0, 2)],
),
# determiner + plural noun
# les romans -> les romans
(
["les", "romans"],
[1, 1],
["det", "ROOT"],
["DET", "NOUN"],
[(0, 2)],
),
# det + adj + noun
# Le vieux Londres -> Le vieux Londres
(
['Les', 'vieux', 'Londres'],
[2, 2, 2],
["det", "amod", "ROOT"],
["DET", "ADJ", "NOUN"],
[(0,3)]
),
# det + noun + adj
# le nom propre -> le nom propre a proper noun
(
["le", "nom", "propre"],
[1, 1, 1],
["det", "ROOT", "amod"],
["DET", "NOUN", "ADJ"],
[(0, 3)],
),
# det + noun + adj plural
# Les chiens bruns -> les chiens bruns
(
["Les", "chiens", "bruns"],
[1, 1, 1],
["det", "ROOT", "amod"],
["DET", "NOUN", "ADJ"],
[(0, 3)],
),
# multiple adjectives: one adj before the noun, one adj after the noun
# un nouveau film intéressant -> un nouveau film intéressant
(
["un", "nouveau", "film", "intéressant"],
[2, 2, 2, 2],
["det", "amod", "ROOT", "amod"],
["DET", "ADJ", "NOUN", "ADJ"],
[(0,4)]
),
# multiple adjectives, both adjs after the noun
# une personne intelligente et drôle -> une personne intelligente et drôle
(
["une", "personne", "intelligente", "et", "drôle"],
[1, 1, 1, 4, 2],
["det", "ROOT", "amod", "cc", "conj"],
["DET", "NOUN", "ADJ", "CCONJ", "ADJ"],
[(0,5)]
),
# relative pronoun
# un bus qui va au ville -> un bus, qui, ville
(
['un', 'bus', 'qui', 'va', 'au', 'ville'],
[1, 1, 3, 1, 5, 3],
['det', 'ROOT', 'nsubj', 'acl:relcl', 'case', 'obl:arg'],
['DET', 'NOUN', 'PRON', 'VERB', 'ADP', 'NOUN'],
[(0,2), (2,3), (5,6)]
),
# relative subclause
# Voilà la maison que nous voulons acheter -> la maison, nous That's the house that we want to buy.
(
['Voilà', 'la', 'maison', 'que', 'nous', 'voulons', 'acheter'],
[0, 2, 0, 5, 5, 2, 5],
['ROOT', 'det', 'obj', 'mark', 'nsubj', 'acl:relcl', 'xcomp'],
['VERB', 'DET', 'NOUN', 'SCONJ', 'PRON', 'VERB', 'VERB'],
[(1,3), (4,5)]
),
# Person name and title by flat
# Louis XIV -> Louis XIV
(
["Louis", "XIV"],
[0, 0],
["ROOT", "flat:name"],
["PROPN", "PROPN"],
[(0,2)]
),
# Organization name by flat
# Nations Unies -> Nations Unies
(
["Nations", "Unies"],
[0, 0],
["ROOT", "flat:name"],
["PROPN", "PROPN"],
[(0,2)]
),
# Noun compound, person name created by two flats
# Louise de Bratagne -> Louise de Bratagne
(
["Louise", "de", "Bratagne"],
[0, 0, 0],
["ROOT", "flat:name", "flat:name"],
["PROPN", "PROPN", "PROPN"],
[(0,3)]
),
# Noun compound, person name created by two flats
# Louis François Joseph -> Louis François Joseph
(
["Louis", "François", "Joseph"],
[0, 0, 0],
["ROOT", "flat:name", "flat:name"],
["PROPN", "PROPN", "PROPN"],
[(0,3)]
),
# one determiner + one noun + one adjective qualified by an adverb
# quelques agriculteurs très riches -> quelques agriculteurs très riches
(
["quelques", "agriculteurs", "très", "riches"],
[1, 1, 3, 1],
['det', 'ROOT', 'advmod', 'amod'],
['DET', 'NOUN', 'ADV', 'ADJ'],
[(0,4)]
),
# Two NPs conjuncted
# Il a un chien et un chat -> Il, un chien, un chat
(
['Il', 'a', 'un', 'chien', 'et', 'un', 'chat'],
[1, 1, 3, 1, 6, 6, 3],
['nsubj', 'ROOT', 'det', 'obj', 'cc', 'det', 'conj'],
['PRON', 'VERB', 'DET', 'NOUN', 'CCONJ', 'DET', 'NOUN'],
[(0,1), (2,4), (5,7)]
),
# Two NPs together
# l'écrivain brésilien Aníbal Machado -> l'écrivain brésilien, Aníbal Machado
(
["l'", 'écrivain', 'brésilien', 'Aníbal', 'Machado'],
[1, 1, 1, 1, 3],
['det', 'ROOT', 'amod', 'appos', 'flat:name'],
['DET', 'NOUN', 'ADJ', 'PROPN', 'PROPN'],
[(0, 3), (3, 5)]
),
# nmod relation between NPs
# la destruction de la ville -> la destruction, la ville
(
['la', 'destruction', 'de', 'la', 'ville'],
[1, 1, 4, 4, 1],
['det', 'ROOT', 'case', 'det', 'nmod'],
['DET', 'NOUN', 'ADP', 'DET', 'NOUN'],
[(0,2), (3,5)]
),
# nmod relation between NPs
# Archiduchesse dAutriche -> Archiduchesse, Autriche
(
['Archiduchesse', 'd', 'Autriche'],
[0, 2, 0],
['ROOT', 'case', 'nmod'],
['NOUN', 'ADP', 'PROPN'],
[(0,1), (2,3)]
),
# Compounding by nmod, several NPs chained together
# la première usine de drogue du gouvernement -> la première usine, drogue, gouvernement
(
["la", "première", "usine", "de", "drogue", "du", "gouvernement"],
[2, 2, 2, 4, 2, 6, 2],
['det', 'amod', 'ROOT', 'case', 'nmod', 'case', 'nmod'],
['DET', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN'],
[(0, 3), (4, 5), (6, 7)]
),
# several NPs
# Traduction du rapport de Susana -> Traduction, rapport, Susana
(
['Traduction', 'du', 'raport', 'de', 'Susana'],
[0, 2, 0, 4, 2],
['ROOT', 'case', 'nmod', 'case', 'nmod'],
['NOUN', 'ADP', 'NOUN', 'ADP', 'PROPN'],
[(0,1), (2,3), (4,5)]
),
# Several NPs
# Le gros chat de Susana et son amie -> Le gros chat, Susana, son amie
(
['Le', 'gros', 'chat', 'de', 'Susana', 'et', 'son', 'amie'],
[2, 2, 2, 4, 2, 7, 7, 2],
['det', 'amod', 'ROOT', 'case', 'nmod', 'cc', 'det', 'conj'],
['DET', 'ADJ', 'NOUN', 'ADP', 'PROPN', 'CCONJ', 'DET', 'NOUN'],
[(0,3), (4,5), (6,8)]
),
# Passive subject
# Les nouvelles dépenses sont alimentées par le grand compte bancaire de Clinton -> Les nouvelles dépenses, le grand compte bancaire, Clinton
(
['Les', 'nouvelles', 'dépenses', 'sont', 'alimentées', 'par', 'le', 'grand', 'compte', 'bancaire', 'de', 'Clinton'],
[2, 2, 4, 4, 4, 8, 8, 8, 4, 8, 11, 8],
['det', 'amod', 'nsubj:pass', 'aux:pass', 'ROOT', 'case', 'det', 'amod', 'obl:agent', 'amod', 'case', 'nmod'],
['DET', 'ADJ', 'NOUN', 'AUX', 'VERB', 'ADP', 'DET', 'ADJ', 'NOUN', 'ADJ', 'ADP', 'PROPN'],
[(0, 3), (6, 10), (11, 12)]
)
],
)
# fmt: on
def test_fr_noun_chunks(fr_vocab, words, heads, deps, pos, chunk_offsets):
doc = Doc(fr_vocab, words=words, heads=heads, deps=deps, pos=pos)
assert [(c.start, c.end) for c in doc.noun_chunks] == chunk_offsets
def test_noun_chunks_is_parsed_fr(fr_tokenizer):
"""Test that noun_chunks raises Value Error for 'fr' language if Doc is not parsed."""
doc = fr_tokenizer("Je suis allé à l'école")
doc = fr_tokenizer("trouver des travaux antérieurs")
with pytest.raises(ValueError):
list(doc.noun_chunks)

View File

@ -1,221 +0,0 @@
from spacy.tokens import Doc
import pytest
# fmt: off
@pytest.mark.parametrize(
"words,heads,deps,pos,chunk_offsets",
[
# determiner + noun
# un pollo -> un pollo
(
["un", "pollo"],
[1, 1],
["det", "ROOT"],
["DET", "NOUN"],
[(0,2)],
),
# two determiners + noun
# il mio cane -> il mio cane
(
["il", "mio", "cane"],
[2, 2, 2],
["det", "det:poss", "ROOT"],
["DET", "DET", "NOUN"],
[(0,3)],
),
# two determiners, one is after noun. rare usage but still testing
# il cane mio-> il cane mio
(
["il", "cane", "mio"],
[1, 1, 1],
["det", "ROOT", "det:poss"],
["DET", "NOUN", "DET"],
[(0,3)],
),
# relative pronoun
# È molto bello il vestito che hai acquistat -> il vestito, che the dress that you bought is very pretty.
(
["È", "molto", "bello", "il", "vestito", "che", "hai", "acquistato"],
[2, 2, 2, 4, 2, 7, 7, 4],
['cop', 'advmod', 'ROOT', 'det', 'nsubj', 'obj', 'aux', 'acl:relcl'],
['AUX', 'ADV', 'ADJ', 'DET', 'NOUN', 'PRON', 'AUX', 'VERB'],
[(3,5), (5,6)]
),
# relative subclause
# il computer che hai comprato -> il computer, che the computer that you bought
(
['il', 'computer', 'che', 'hai', 'comprato'],
[1, 1, 4, 4, 1],
['det', 'ROOT', 'nsubj', 'aux', 'acl:relcl'],
['DET', 'NOUN', 'PRON', 'AUX', 'VERB'],
[(0,2), (2,3)]
),
# det + noun + adj
# Una macchina grande -> Una macchina grande
(
["Una", "macchina", "grande"],
[1, 1, 1],
["det", "ROOT", "amod"],
["DET", "NOUN", "ADJ"],
[(0,3)],
),
# noun + adj plural
# mucche bianche
(
["mucche", "bianche"],
[0, 0],
["ROOT", "amod"],
["NOUN", "ADJ"],
[(0,2)],
),
# det + adj + noun
# Una grande macchina -> Una grande macchina
(
['Una', 'grande', 'macchina'],
[2, 2, 2],
["det", "amod", "ROOT"],
["DET", "ADJ", "NOUN"],
[(0,3)]
),
# det + adj + noun, det with apostrophe
# un'importante associazione -> un'importante associazione
(
["Un'", 'importante', 'associazione'],
[2, 2, 2],
["det", "amod", "ROOT"],
["DET", "ADJ", "NOUN"],
[(0,3)]
),
# multiple adjectives
# Un cane piccolo e marrone -> Un cane piccolo e marrone
(
["Un", "cane", "piccolo", "e", "marrone"],
[1, 1, 1, 4, 2],
["det", "ROOT", "amod", "cc", "conj"],
["DET", "NOUN", "ADJ", "CCONJ", "ADJ"],
[(0,5)]
),
# determiner, adjective, compound created by flat
# le Nazioni Unite -> le Nazioni Unite
(
["le", "Nazioni", "Unite"],
[1, 1, 1],
["det", "ROOT", "flat:name"],
["DET", "PROPN", "PROPN"],
[(0,3)]
),
# one determiner + one noun + one adjective qualified by an adverb
# alcuni contadini molto ricchi -> alcuni contadini molto ricchi some very rich farmers
(
['alcuni', 'contadini', 'molto', 'ricchi'],
[1, 1, 3, 1],
['det', 'ROOT', 'advmod', 'amod'],
['DET', 'NOUN', 'ADV', 'ADJ'],
[(0,4)]
),
# Two NPs conjuncted
# Ho un cane e un gatto -> un cane, un gatto
(
['Ho', 'un', 'cane', 'e', 'un', 'gatto'],
[0, 2, 0, 5, 5, 0],
['ROOT', 'det', 'obj', 'cc', 'det', 'conj'],
['VERB', 'DET', 'NOUN', 'CCONJ', 'DET', 'NOUN'],
[(1,3), (4,6)]
),
# Two NPs together
# lo scrittore brasiliano Aníbal Machado -> lo scrittore brasiliano, Aníbal Machado
(
['lo', 'scrittore', 'brasiliano', 'Aníbal', 'Machado'],
[1, 1, 1, 1, 3],
['det', 'ROOT', 'amod', 'nmod', 'flat:name'],
['DET', 'NOUN', 'ADJ', 'PROPN', 'PROPN'],
[(0, 3), (3, 5)]
),
# Noun compound, person name and titles
# Dom Pedro II -> Dom Pedro II
(
["Dom", "Pedro", "II"],
[0, 0, 0],
["ROOT", "flat:name", "flat:name"],
["PROPN", "PROPN", "PROPN"],
[(0,3)]
),
# Noun compound created by flat
# gli Stati Uniti
(
["gli", "Stati", "Uniti"],
[1, 1, 1],
["det", "ROOT", "flat:name"],
["DET", "PROPN", "PROPN"],
[(0,3)]
),
# nmod relation between NPs
# la distruzione della città -> la distruzione, città
(
['la', 'distruzione', 'della', 'città'],
[1, 1, 3, 1],
['det', 'ROOT', 'case', 'nmod'],
['DET', 'NOUN', 'ADP', 'NOUN'],
[(0,2), (3,4)]
),
# Compounding by nmod, several NPs chained together
# la prima fabbrica di droga del governo -> la prima fabbrica, droga, governo
(
["la", "prima", "fabbrica", "di", "droga", "del", "governo"],
[2, 2, 2, 4, 2, 6, 2],
['det', 'amod', 'ROOT', 'case', 'nmod', 'case', 'nmod'],
['DET', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN'],
[(0, 3), (4, 5), (6, 7)]
),
# several NPs
# Traduzione del rapporto di Susana -> Traduzione, rapporto, Susana
(
['Traduzione', 'del', 'rapporto', 'di', 'Susana'],
[0, 2, 0, 4, 2],
['ROOT', 'case', 'nmod', 'case', 'nmod'],
['NOUN', 'ADP', 'NOUN', 'ADP', 'PROPN'],
[(0,1), (2,3), (4,5)]
),
# Several NPs
# Il gatto grasso di Susana e la sua amica -> Il gatto grasso, Susana, sua amica
(
['Il', 'gatto', 'grasso', 'di', 'Susana', 'e', 'la', 'sua', 'amica'],
[1, 1, 1, 4, 1, 8, 8, 8, 1],
['det', 'ROOT', 'amod', 'case', 'nmod', 'cc', 'det', 'det:poss', 'conj'],
['DET', 'NOUN', 'ADJ', 'ADP', 'PROPN', 'CCONJ', 'DET', 'DET', 'NOUN'],
[(0,3), (4,5), (6,9)]
),
# Passive subject
# La nuova spesa è alimentata dal grande conto in banca di Clinton -> Le nuova spesa, grande conto, banca, Clinton
(
['La', 'nuova', 'spesa', 'è', 'alimentata', 'dal', 'grande', 'conto', 'in', 'banca', 'di', 'Clinton'],
[2, 2, 4, 4, 4, 7, 7, 4, 9, 7, 11, 9],
['det', 'amod', 'nsubj:pass', 'aux:pass', 'ROOT', 'case', 'amod', 'obl:agent', 'case', 'nmod', 'case', 'nmod'],
['DET', 'ADJ', 'NOUN', 'AUX', 'VERB', 'ADP', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'ADP', 'PROPN'],
[(0, 3), (6, 8), (9, 10), (11,12)]
),
# Misc
# Ma mentre questo prestito possa ora sembrare gestibile, un improvviso cambiamento delle circostanze potrebbe portare a problemi di debiti -> questo prestiti, un provisso cambiento, circostanze, problemi, debiti
(
['Ma', 'mentre', 'questo', 'prestito', 'possa', 'ora', 'sembrare', 'gestibile', ',', 'un', 'improvviso', 'cambiamento', 'delle', 'circostanze', 'potrebbe', 'portare', 'a', 'problemi', 'di', 'debitii'],
[15, 6, 3, 6, 6, 6, 15, 6, 6, 11, 11, 15, 13, 11, 15, 15, 17, 15, 19, 17],
['cc', 'mark', 'det', 'nsubj', 'aux', 'advmod', 'advcl', 'xcomp', 'punct', 'det', 'amod', 'nsubj', 'case', 'nmod', 'aux', 'ROOT', 'case', 'obl', 'case', 'nmod'],
['CCONJ', 'SCONJ', 'DET', 'NOUN', 'AUX', 'ADV', 'VERB', 'ADJ', 'PUNCT', 'DET', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'AUX', 'VERB', 'ADP', 'NOUN', 'ADP', 'NOUN'],
[(2,4), (9,12), (13,14), (17,18), (19,20)]
)
],
)
# fmt: on
def test_it_noun_chunks(it_vocab, words, heads, deps, pos, chunk_offsets):
doc = Doc(it_vocab, words=words, heads=heads, deps=deps, pos=pos)
assert [(c.start, c.end) for c in doc.noun_chunks] == chunk_offsets
def test_noun_chunks_is_parsed_it(it_tokenizer):
"""Test that noun_chunks raises Value Error for 'it' language if Doc is not parsed."""
doc = it_tokenizer("Sei andato a Oxford")
with pytest.raises(ValueError):
list(doc.noun_chunks)