Revert "Bump sudachipy version (#9917)" (#10071)

This reverts commit 58bdd8607b.
Author: Adriane Boyd
Date: 2022-01-17 10:38:37 +01:00 (committed via GitHub)
Parent: 6a8619dd73
Commit: add52935ff
10 changed files with 162 additions and 624 deletions

setup.cfg

@@ -108,8 +108,8 @@ apple =
     thinc-apple-ops>=0.0.4,<1.0.0
 # Language tokenizers with external dependencies
 ja =
-    sudachipy>=0.5.2,!=0.6.1
-    sudachidict_core>=20211220
+    sudachipy>=0.4.9
+    sudachidict_core>=20200330
 ko =
     natto-py==0.9.0
 th =
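The effect of this hunk is to widen the ja extra back to its pre-bump ranges. A minimal sketch of what the two sudachipy specifier lines accept, using the third-party packaging library (illustration only; the names bumped and reverted are ours, not part of the diff):

    from packaging.specifiers import SpecifierSet

    bumped = SpecifierSet(">=0.5.2,!=0.6.1")  # requirement before the revert
    reverted = SpecifierSet(">=0.4.9")        # requirement after the revert

    for version in ["0.4.9", "0.5.2", "0.6.0", "0.6.1"]:
        print(version, version in bumped, version in reverted)
    # 0.4.9 is accepted only by the reverted range; 0.6.1, which the
    # bumped range explicitly excluded, is accepted again after the revert.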

spacy/lang/char_classes.py

@@ -45,10 +45,6 @@ _hangul_syllables = r"\uAC00-\uD7AF"
 _hangul_jamo = r"\u1100-\u11FF"
 _hangul = _hangul_syllables + _hangul_jamo

-_hiragana = r"\u3040-\u309F"
-_katakana = r"\u30A0-\u30FFー"
-_kana = _hiragana + _katakana
-
 # letters with diacritics - Catalan, Czech, Latin, Latvian, Lithuanian, Polish, Slovak, Turkish, Welsh
 _latin_u_extendedA = (
     r"\u0100\u0102\u0104\u0106\u0108\u010A\u010C\u010E\u0110\u0112\u0114\u0116\u0118\u011A\u011C"
@@ -248,7 +244,6 @@ _uncased = (
     + _tamil
     + _telugu
     + _hangul
-    + _kana
     + _cjk
 )
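The revert drops the kana ranges both as named character classes and from the _uncased set. A small self-contained check of what the removed ranges matched (a sketch for illustration; kana_re is our name, not spaCy's):

    import re

    # The two ranges deleted above: hiragana, and katakana plus the
    # long vowel mark "ー".
    kana_re = re.compile(r"[\u3040-\u309F\u30A0-\u30FFー]")

    print(bool(kana_re.match("ひ")))  # True: hiragana
    print(bool(kana_re.match("カ")))  # True: katakana
    print(bool(kana_re.match("漢")))  # False: kanji is covered by _cjk, not _kana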

spacy/lang/fr/syntax_iterators.py

@@ -6,35 +6,16 @@ from ...tokens import Doc, Span


 def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
-    """
-    Detect base noun phrases from a dependency parse. Works on both Doc and Span.
-    """
-    labels = [
-        "nsubj",
-        "nsubj:pass",
-        "obj",
-        "obl",
-        "obl:agent",
-        "obl:arg",
-        "obl:mod",
-        "nmod",
-        "pcomp",
-        "appos",
-        "ROOT",
-    ]
-    post_modifiers = ["flat", "flat:name", "flat:foreign", "fixed", "compound"]
+    """Detect base noun phrases from a dependency parse. Works on Doc and Span."""
+    # fmt: off
+    labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
+    # fmt: on
     doc = doclike.doc  # Ensure works on both Doc and Span.
     if not doc.has_annotation("DEP"):
         raise ValueError(Errors.E029)
-    np_deps = {doc.vocab.strings.add(label) for label in labels}
-    np_modifs = {doc.vocab.strings.add(modifier) for modifier in post_modifiers}
+    np_deps = [doc.vocab.strings[label] for label in labels]
+    conj = doc.vocab.strings.add("conj")
     np_label = doc.vocab.strings.add("NP")
-    adj_label = doc.vocab.strings.add("amod")
-    det_label = doc.vocab.strings.add("det")
-    det_pos = doc.vocab.strings.add("DET")
-    adp_pos = doc.vocab.strings.add("ADP")
-    conj_label = doc.vocab.strings.add("conj")
-    conj_pos = doc.vocab.strings.add("CCONJ")
     prev_end = -1
     for i, word in enumerate(doclike):
         if word.pos not in (NOUN, PROPN, PRON):
@@ -43,45 +24,16 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
         if word.left_edge.i <= prev_end:
             continue
         if word.dep in np_deps:
-            right_childs = list(word.rights)
-            right_child = right_childs[0] if right_childs else None
-            if right_child:
-                if (
-                    right_child.dep == adj_label
-                ):  # allow chain of adjectives by expanding to right
-                    right_end = right_child.right_edge
-                elif (
-                    right_child.dep == det_label and right_child.pos == det_pos
-                ):  # cut relative pronouns here
-                    right_end = right_child
-                elif right_child.dep in np_modifs:  # Check if we can expand to right
-                    right_end = word.right_edge
-                else:
-                    right_end = word
-            else:
-                right_end = word
-            prev_end = right_end.i
-            left_index = word.left_edge.i
-            left_index = (
-                left_index + 1 if word.left_edge.pos == adp_pos else left_index
-            )
-            yield left_index, right_end.i + 1, np_label
-        elif word.dep == conj_label:
+            prev_end = word.right_edge.i
+            yield word.left_edge.i, word.right_edge.i + 1, np_label
+        elif word.dep == conj:
             head = word.head
-            while head.dep == conj_label and head.head.i < head.i:
+            while head.dep == conj and head.head.i < head.i:
                 head = head.head
             # If the head is an NP, and we're coordinated to it, we're an NP
             if head.dep in np_deps:
-                prev_end = word.i
-                left_index = word.left_edge.i  # eliminate left attached conjunction
-                left_index = (
-                    left_index + 1 if word.left_edge.pos == conj_pos else left_index
-                )
-                yield left_index, word.i + 1, np_label
+                prev_end = word.right_edge.i
+                yield word.left_edge.i, word.right_edge.i + 1, np_label


 SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
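To see the behavioral difference, take "la destruction de la ville" from the French test data deleted later in this commit: the removed iterator split off nmod dependents and trimmed leading case markers, yielding "la destruction" and "la ville", while the restored iterator expands every chunk to its head's right edge. A runnable sketch against the restored iterator (hand-annotated Doc, no model needed; expected outputs follow from the code above and the deleted test expectations):

    from spacy.lang.fr import French
    from spacy.tokens import Doc

    nlp = French()
    doc = Doc(
        nlp.vocab,
        words=["la", "destruction", "de", "la", "ville"],
        heads=[1, 1, 4, 4, 1],
        deps=["det", "ROOT", "case", "det", "nmod"],
        pos=["DET", "NOUN", "ADP", "DET", "NOUN"],
    )
    # Restored iterator: one chunk spanning the head's whole subtree.
    print([(c.start, c.end, c.text) for c in doc.noun_chunks])
    # -> [(0, 5, "la destruction de la ville")]
    # The removed iterator yielded [(0, 2), (3, 5)] for the same parse.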

spacy/lang/it/__init__.py

@@ -6,15 +6,13 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
 from ...language import Language, BaseDefaults
 from .lemmatizer import ItalianLemmatizer
-from .syntax_iterators import SYNTAX_ITERATORS


 class ItalianDefaults(BaseDefaults):
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
-    stop_words = STOP_WORDS
     prefixes = TOKENIZER_PREFIXES
     infixes = TOKENIZER_INFIXES
-    syntax_iterators = SYNTAX_ITERATORS
+    stop_words = STOP_WORDS


 class Italian(Language):
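With the registration removed here and the module deleted in the next file, Italian loses noun chunk support entirely. A sketch of the resulting behavior (assuming spaCy's usual NotImplementedError for languages without a noun_chunks syntax iterator):

    from spacy.lang.it import Italian
    from spacy.tokens import Doc

    nlp = Italian()
    doc = Doc(
        nlp.vocab,
        words=["un", "pollo"],
        heads=[1, 1],
        deps=["det", "ROOT"],
        pos=["DET", "NOUN"],
    )
    try:
        print(list(doc.noun_chunks))
    except NotImplementedError as err:
        # Raised because no noun_chunks iterator is registered for "it".
        print("noun_chunks unavailable for Italian:", err)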

spacy/lang/it/syntax_iterators.py (file deleted)

@@ -1,86 +0,0 @@
-from typing import Union, Iterator, Tuple
-
-from ...symbols import NOUN, PROPN, PRON
-from ...errors import Errors
-from ...tokens import Doc, Span
-
-
-def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
-    """
-    Detect base noun phrases from a dependency parse. Works on both Doc and Span.
-    """
-    labels = [
-        "nsubj",
-        "nsubj:pass",
-        "obj",
-        "obl",
-        "obl:agent",
-        "nmod",
-        "pcomp",
-        "appos",
-        "ROOT",
-    ]
-    post_modifiers = ["flat", "flat:name", "fixed", "compound"]
-    dets = ["det", "det:poss"]
-    doc = doclike.doc  # Ensure works on both Doc and Span.
-    if not doc.has_annotation("DEP"):
-        raise ValueError(Errors.E029)
-    np_deps = {doc.vocab.strings.add(label) for label in labels}
-    np_modifs = {doc.vocab.strings.add(modifier) for modifier in post_modifiers}
-    np_label = doc.vocab.strings.add("NP")
-    adj_label = doc.vocab.strings.add("amod")
-    det_labels = {doc.vocab.strings.add(det) for det in dets}
-    det_pos = doc.vocab.strings.add("DET")
-    adp_label = doc.vocab.strings.add("ADP")
-    conj = doc.vocab.strings.add("conj")
-    conj_pos = doc.vocab.strings.add("CCONJ")
-    prev_end = -1
-    for i, word in enumerate(doclike):
-        if word.pos not in (NOUN, PROPN, PRON):
-            continue
-        # Prevent nested chunks from being produced
-        if word.left_edge.i <= prev_end:
-            continue
-        if word.dep in np_deps:
-            right_childs = list(word.rights)
-            right_child = right_childs[0] if right_childs else None
-            if right_child:
-                if (
-                    right_child.dep == adj_label
-                ):  # allow chain of adjectives by expanding to right
-                    right_end = right_child.right_edge
-                elif (
-                    right_child.dep in det_labels and right_child.pos == det_pos
-                ):  # cut relative pronouns here
-                    right_end = right_child
-                elif right_child.dep in np_modifs:  # Check if we can expand to right
-                    right_end = word.right_edge
-                else:
-                    right_end = word
-            else:
-                right_end = word
-            prev_end = right_end.i
-            left_index = word.left_edge.i
-            left_index = (
-                left_index + 1 if word.left_edge.pos == adp_label else left_index
-            )
-            yield left_index, right_end.i + 1, np_label
-        elif word.dep == conj:
-            head = word.head
-            while head.dep == conj and head.head.i < head.i:
-                head = head.head
-            # If the head is an NP, and we're coordinated to it, we're an NP
-            if head.dep in np_deps:
-                prev_end = word.i
-                left_index = word.left_edge.i  # eliminate left attached conjunction
-                left_index = (
-                    left_index + 1 if word.left_edge.pos == conj_pos else left_index
-                )
-                yield left_index, word.i + 1, np_label
-
-
-SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}

spacy/lang/nb/stop_words.py

@@ -4,42 +4,46 @@ alle allerede alt and andre annen annet at av
 bak bare bedre beste blant ble bli blir blitt bris by både

-da dag de del dem den denne der dermed det dette disse du
+da dag de del dem den denne der dermed det dette disse drept du

 eller en enn er et ett etter

-fem fikk fire fjor flere folk for fortsatt fra fram
+fem fikk fire fjor flere folk for fortsatt fotball fra fram frankrike fredag

 funnet får fått før først første

 gang gi gikk gjennom gjorde gjort gjør gjøre god godt grunn går

-ha hadde ham han hans har hele helt henne hennes her hun
+ha hadde ham han hans har hele helt henne hennes her hun hva hvor hvordan
+hvorfor

 i ifølge igjen ikke ingen inn

 ja jeg

 kamp kampen kan kl klart kom komme kommer kontakt kort kroner kunne kveld
+kvinner

-la laget land landet langt leder ligger like litt løpet
+la laget land landet langt leder ligger like litt løpet lørdag

-man mange med meg mellom men mener mennesker mens mer mot mye mål måtte
+man mandag mange mannen mars med meg mellom men mener menn mennesker mens mer
+millioner minutter mot msci mye mål måtte

-ned neste noe noen nok ny nye når
+ned neste noe noen nok norge norsk norske ntb ny nye når

-og også om opp opplyser oss over
+og også om onsdag opp opplyser oslo oss over

-personer plass poeng på
+personer plass poeng politidistrikt politiet president prosent på

-runde rundt
+regjeringen runde rundt russland

-sa saken samme sammen samtidig satt se seg seks selv senere ser sett
+sa saken samme sammen samtidig satt se seg seks selv senere september ser sett

 siden sier sin sine siste sitt skal skriver skulle slik som sted stedet stor
-store står svært
+store står sverige svært søndag

-ta tatt tid tidligere til tilbake tillegg tok tror
+ta tatt tid tidligere til tilbake tillegg tirsdag to tok torsdag tre tror
+tyskland

-under ut uten utenfor
+under usa ut uten utenfor

 vant var ved veldig vi videre viktig vil ville viser vår være vært

spacy/lang/sl/stop_words.py

@@ -1,10 +1,13 @@
 # Source: https://github.com/stopwords-iso/stopwords-sl
-# Removed various words that are not normally considered stop words, such as months.
+# TODO: probably needs to be tidied up the list seems to have month names in
+# it, which shouldn't be considered stop words.

 STOP_WORDS = set(
     """
 a
 ali
+april
+avgust
 b
 bi
 bil
@@ -16,6 +19,7 @@ biti
 blizu
 bo
 bodo
+bojo
 bolj
 bom
 bomo
@@ -33,6 +37,16 @@ da
 daleč
 dan
 danes
+datum
+december
+deset
+deseta
+deseti
+deseto
+devet
+deveta
+deveti
+deveto
 do
 dober
 dobra
@@ -40,7 +54,16 @@ dobri
 dobro
 dokler
 dol
+dolg
+dolga
+dolgi
 dovolj
+drug
+druga
+drugi
+drugo
+dva
+dve
 e
 eden
 en
@@ -51,6 +74,7 @@ enkrat
 eno
 etc.
 f
+februar
 g
 g.
 ga
@@ -69,12 +93,16 @@ iv
 ix
 iz
 j
+januar
 jaz
 je
 ji
 jih
 jim
 jo
+julij
+junij
+jutri
 k
 kadarkoli
 kaj
@@ -95,23 +123,41 @@ kje
 kjer
 kjerkoli
 ko
+koder
 koderkoli
 koga
 komu
 kot
+kratek
+kratka
+kratke
+kratki
 l
+lahka
+lahke
+lahki
+lahko
 le
 lep
 lepa
 lepe
 lepi
 lepo
+leto
 m
+maj
+majhen
+majhna
+majhni
+malce
+malo
 manj
+marec
 me
 med
 medtem
 mene
+mesec
 mi
 midva
 midve
@@ -137,6 +183,7 @@ najmanj
 naju
 največ
 nam
+narobe
 nas
 nato
 nazaj
@@ -145,6 +192,7 @@ naša
 naše
 ne
 nedavno
+nedelja
 nek
 neka
 nekaj
@@ -188,6 +236,7 @@ njuna
 njuno
 no
 nocoj
+november
 npr.
 o
 ob
@@ -195,23 +244,51 @@ oba
 obe
 oboje
 od
+odprt
+odprta
+odprti
 okoli
+oktober
 on
 onadva
 one
 oni
 onidve
+osem
+osma
+osmi
+osmo
 oz.
 p
 pa
+pet
+peta
+petek
+peti
+peto
 po
 pod
 pogosto
 poleg
+poln
+polna
+polni
+polno
 ponavadi
+ponedeljek
 ponovno
 potem
 povsod
+pozdravljen
+pozdravljeni
+prav
+prava
+prave
+pravi
+pravo
+prazen
+prazna
+prazno
 prbl.
 precej
 pred
@@ -220,10 +297,19 @@ preko
 pri
 pribl.
 približno
+primer
+pripravljen
+pripravljena
+pripravljeni
 proti
+prva
+prvi
+prvo
 r
+ravno
 redko
 res
+reč
 s
 saj
 sam
@@ -235,17 +321,29 @@ se
 sebe
 sebi
 sedaj
+sedem
+sedma
+sedmi
+sedmo
 sem
+september
 seveda
 si
 sicer
 skoraj
 skozi
+slab
 smo
 so
+sobota
 spet
+sreda
+srednja
+srednji
 sta
 ste
+stran
+stvar
 sva
 t
 ta
@@ -260,6 +358,10 @@ te
 tebe
 tebi
 tega
+težak
+težka
+težki
+težko
 ti
 tista
 tiste
@@ -269,6 +371,11 @@ tj.
 tja
 to
 toda
+torek
+tretja
+tretje
+tretji
+tri
 tu
 tudi
 tukaj
@@ -285,6 +392,10 @@ vaša
 vaše
 ve
 vedno
+velik
+velika
+veliki
+veliko
 vendar
 ves
 več
@@ -292,6 +403,10 @@ vi
 vidva
 vii
 viii
+visok
+visoka
+visoke
+visoki
 vsa
 vsaj
 vsak
@@ -305,21 +420,34 @@ vsega
 vsi
 vso
 včasih
+včeraj
 x
 z
 za
 zadaj
 zadnji
 zakaj
+zaprta
+zaprti
+zaprto
 zdaj
 zelo
 zunaj
 č
 če
 često
+četrta
+četrtek
+četrti
+četrto
 čez
 čigav
 š
+šest
+šesta
+šesti
+šesto
+štiri
 ž
 že
 """.split()

spacy/tests/conftest.py

@@ -155,11 +155,6 @@ def fr_tokenizer():
     return get_lang_class("fr")().tokenizer


-@pytest.fixture(scope="session")
-def fr_vocab():
-    return get_lang_class("fr")().vocab
-
-
 @pytest.fixture(scope="session")
 def ga_tokenizer():
     return get_lang_class("ga")().tokenizer
@@ -210,11 +205,6 @@ def it_tokenizer():
     return get_lang_class("it")().tokenizer


-@pytest.fixture(scope="session")
-def it_vocab():
-    return get_lang_class("it")().vocab
-
-
 @pytest.fixture(scope="session")
 def ja_tokenizer():
     pytest.importorskip("sudachipy")

spacy/tests/lang/fr/test_noun_chunks.py

@@ -1,230 +1,8 @@
-from spacy.tokens import Doc
 import pytest


-# fmt: off
-@pytest.mark.parametrize(
-    "words,heads,deps,pos,chunk_offsets",
-    [
-        # determiner + noun
-        # un nom -> un nom
-        (
-            ["un", "nom"],
-            [1, 1],
-            ["det", "ROOT"],
-            ["DET", "NOUN"],
-            [(0, 2)],
-        ),
-        # determiner + noun starting with vowel
-        # l'heure -> l'heure
-        (
-            ["l'", "heure"],
-            [1, 1],
-            ["det", "ROOT"],
-            ["DET", "NOUN"],
-            [(0, 2)],
-        ),
-        # determiner + plural noun
-        # les romans -> les romans
-        (
-            ["les", "romans"],
-            [1, 1],
-            ["det", "ROOT"],
-            ["DET", "NOUN"],
-            [(0, 2)],
-        ),
-        # det + adj + noun
-        # Le vieux Londres -> Le vieux Londres
-        (
-            ['Les', 'vieux', 'Londres'],
-            [2, 2, 2],
-            ["det", "amod", "ROOT"],
-            ["DET", "ADJ", "NOUN"],
-            [(0,3)]
-        ),
-        # det + noun + adj
-        # le nom propre -> le nom propre   a proper noun
-        (
-            ["le", "nom", "propre"],
-            [1, 1, 1],
-            ["det", "ROOT", "amod"],
-            ["DET", "NOUN", "ADJ"],
-            [(0, 3)],
-        ),
-        # det + noun + adj plural
-        # Les chiens bruns -> les chiens bruns
-        (
-            ["Les", "chiens", "bruns"],
-            [1, 1, 1],
-            ["det", "ROOT", "amod"],
-            ["DET", "NOUN", "ADJ"],
-            [(0, 3)],
-        ),
-        # multiple adjectives: one adj before the noun, one adj after the noun
-        # un nouveau film intéressant -> un nouveau film intéressant
-        (
-            ["un", "nouveau", "film", "intéressant"],
-            [2, 2, 2, 2],
-            ["det", "amod", "ROOT", "amod"],
-            ["DET", "ADJ", "NOUN", "ADJ"],
-            [(0,4)]
-        ),
-        # multiple adjectives, both adjs after the noun
-        # une personne intelligente et drôle -> une personne intelligente et drôle
-        (
-            ["une", "personne", "intelligente", "et", "drôle"],
-            [1, 1, 1, 4, 2],
-            ["det", "ROOT", "amod", "cc", "conj"],
-            ["DET", "NOUN", "ADJ", "CCONJ", "ADJ"],
-            [(0,5)]
-        ),
-        # relative pronoun
-        # un bus qui va au ville -> un bus, qui, ville
-        (
-            ['un', 'bus', 'qui', 'va', 'au', 'ville'],
-            [1, 1, 3, 1, 5, 3],
-            ['det', 'ROOT', 'nsubj', 'acl:relcl', 'case', 'obl:arg'],
-            ['DET', 'NOUN', 'PRON', 'VERB', 'ADP', 'NOUN'],
-            [(0,2), (2,3), (5,6)]
-        ),
-        # relative subclause
-        # Voilà la maison que nous voulons acheter -> la maison, nous   That's the house that we want to buy.
-        (
-            ['Voilà', 'la', 'maison', 'que', 'nous', 'voulons', 'acheter'],
-            [0, 2, 0, 5, 5, 2, 5],
-            ['ROOT', 'det', 'obj', 'mark', 'nsubj', 'acl:relcl', 'xcomp'],
-            ['VERB', 'DET', 'NOUN', 'SCONJ', 'PRON', 'VERB', 'VERB'],
-            [(1,3), (4,5)]
-        ),
-        # Person name and title by flat
-        # Louis XIV -> Louis XIV
-        (
-            ["Louis", "XIV"],
-            [0, 0],
-            ["ROOT", "flat:name"],
-            ["PROPN", "PROPN"],
-            [(0,2)]
-        ),
-        # Organization name by flat
-        # Nations Unies -> Nations Unies
-        (
-            ["Nations", "Unies"],
-            [0, 0],
-            ["ROOT", "flat:name"],
-            ["PROPN", "PROPN"],
-            [(0,2)]
-        ),
-        # Noun compound, person name created by two flats
-        # Louise de Bratagne -> Louise de Bratagne
-        (
-            ["Louise", "de", "Bratagne"],
-            [0, 0, 0],
-            ["ROOT", "flat:name", "flat:name"],
-            ["PROPN", "PROPN", "PROPN"],
-            [(0,3)]
-        ),
-        # Noun compound, person name created by two flats
-        # Louis François Joseph -> Louis François Joseph
-        (
-            ["Louis", "François", "Joseph"],
-            [0, 0, 0],
-            ["ROOT", "flat:name", "flat:name"],
-            ["PROPN", "PROPN", "PROPN"],
-            [(0,3)]
-        ),
-        # one determiner + one noun + one adjective qualified by an adverb
-        # quelques agriculteurs très riches -> quelques agriculteurs très riches
-        (
-            ["quelques", "agriculteurs", "très", "riches"],
-            [1, 1, 3, 1],
-            ['det', 'ROOT', 'advmod', 'amod'],
-            ['DET', 'NOUN', 'ADV', 'ADJ'],
-            [(0,4)]
-        ),
-        # Two NPs conjuncted
-        # Il a un chien et un chat -> Il, un chien, un chat
-        (
-            ['Il', 'a', 'un', 'chien', 'et', 'un', 'chat'],
-            [1, 1, 3, 1, 6, 6, 3],
-            ['nsubj', 'ROOT', 'det', 'obj', 'cc', 'det', 'conj'],
-            ['PRON', 'VERB', 'DET', 'NOUN', 'CCONJ', 'DET', 'NOUN'],
-            [(0,1), (2,4), (5,7)]
-        ),
-        # Two NPs together
-        # l'écrivain brésilien Aníbal Machado -> l'écrivain brésilien, Aníbal Machado
-        (
-            ["l'", 'écrivain', 'brésilien', 'Aníbal', 'Machado'],
-            [1, 1, 1, 1, 3],
-            ['det', 'ROOT', 'amod', 'appos', 'flat:name'],
-            ['DET', 'NOUN', 'ADJ', 'PROPN', 'PROPN'],
-            [(0, 3), (3, 5)]
-        ),
-        # nmod relation between NPs
-        # la destruction de la ville -> la destruction, la ville
-        (
-            ['la', 'destruction', 'de', 'la', 'ville'],
-            [1, 1, 4, 4, 1],
-            ['det', 'ROOT', 'case', 'det', 'nmod'],
-            ['DET', 'NOUN', 'ADP', 'DET', 'NOUN'],
-            [(0,2), (3,5)]
-        ),
-        # nmod relation between NPs
-        # Archiduchesse d'Autriche -> Archiduchesse, Autriche
-        (
-            ['Archiduchesse', 'd', 'Autriche'],
-            [0, 2, 0],
-            ['ROOT', 'case', 'nmod'],
-            ['NOUN', 'ADP', 'PROPN'],
-            [(0,1), (2,3)]
-        ),
-        # Compounding by nmod, several NPs chained together
-        # la première usine de drogue du gouvernement -> la première usine, drogue, gouvernement
-        (
-            ["la", "première", "usine", "de", "drogue", "du", "gouvernement"],
-            [2, 2, 2, 4, 2, 6, 2],
-            ['det', 'amod', 'ROOT', 'case', 'nmod', 'case', 'nmod'],
-            ['DET', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN'],
-            [(0, 3), (4, 5), (6, 7)]
-        ),
-        # several NPs
-        # Traduction du rapport de Susana -> Traduction, rapport, Susana
-        (
-            ['Traduction', 'du', 'raport', 'de', 'Susana'],
-            [0, 2, 0, 4, 2],
-            ['ROOT', 'case', 'nmod', 'case', 'nmod'],
-            ['NOUN', 'ADP', 'NOUN', 'ADP', 'PROPN'],
-            [(0,1), (2,3), (4,5)]
-        ),
-        # Several NPs
-        # Le gros chat de Susana et son amie -> Le gros chat, Susana, son amie
-        (
-            ['Le', 'gros', 'chat', 'de', 'Susana', 'et', 'son', 'amie'],
-            [2, 2, 2, 4, 2, 7, 7, 2],
-            ['det', 'amod', 'ROOT', 'case', 'nmod', 'cc', 'det', 'conj'],
-            ['DET', 'ADJ', 'NOUN', 'ADP', 'PROPN', 'CCONJ', 'DET', 'NOUN'],
-            [(0,3), (4,5), (6,8)]
-        ),
-        # Passive subject
-        # Les nouvelles dépenses sont alimentées par le grand compte bancaire de Clinton -> Les nouvelles dépenses, le grand compte bancaire, Clinton
-        (
-            ['Les', 'nouvelles', 'dépenses', 'sont', 'alimentées', 'par', 'le', 'grand', 'compte', 'bancaire', 'de', 'Clinton'],
-            [2, 2, 4, 4, 4, 8, 8, 8, 4, 8, 11, 8],
-            ['det', 'amod', 'nsubj:pass', 'aux:pass', 'ROOT', 'case', 'det', 'amod', 'obl:agent', 'amod', 'case', 'nmod'],
-            ['DET', 'ADJ', 'NOUN', 'AUX', 'VERB', 'ADP', 'DET', 'ADJ', 'NOUN', 'ADJ', 'ADP', 'PROPN'],
-            [(0, 3), (6, 10), (11, 12)]
-        )
-    ],
-)
-# fmt: on
-def test_fr_noun_chunks(fr_vocab, words, heads, deps, pos, chunk_offsets):
-    doc = Doc(fr_vocab, words=words, heads=heads, deps=deps, pos=pos)
-    assert [(c.start, c.end) for c in doc.noun_chunks] == chunk_offsets
-
-
 def test_noun_chunks_is_parsed_fr(fr_tokenizer):
     """Test that noun_chunks raises Value Error for 'fr' language if Doc is not parsed."""
-    doc = fr_tokenizer("Je suis allé à l'école")
+    doc = fr_tokenizer("trouver des travaux antérieurs")
     with pytest.raises(ValueError):
         list(doc.noun_chunks)

spacy/tests/lang/it/test_noun_chunks.py (file deleted)

@@ -1,221 +0,0 @@
-from spacy.tokens import Doc
-import pytest
-
-
-# fmt: off
-@pytest.mark.parametrize(
-    "words,heads,deps,pos,chunk_offsets",
-    [
-        # determiner + noun
-        # un pollo -> un pollo
-        (
-            ["un", "pollo"],
-            [1, 1],
-            ["det", "ROOT"],
-            ["DET", "NOUN"],
-            [(0,2)],
-        ),
-        # two determiners + noun
-        # il mio cane -> il mio cane
-        (
-            ["il", "mio", "cane"],
-            [2, 2, 2],
-            ["det", "det:poss", "ROOT"],
-            ["DET", "DET", "NOUN"],
-            [(0,3)],
-        ),
-        # two determiners, one is after noun. rare usage but still testing
-        # il cane mio -> il cane mio
-        (
-            ["il", "cane", "mio"],
-            [1, 1, 1],
-            ["det", "ROOT", "det:poss"],
-            ["DET", "NOUN", "DET"],
-            [(0,3)],
-        ),
-        # relative pronoun
-        # È molto bello il vestito che hai acquistato -> il vestito, che   the dress that you bought is very pretty.
-        (
-            ["È", "molto", "bello", "il", "vestito", "che", "hai", "acquistato"],
-            [2, 2, 2, 4, 2, 7, 7, 4],
-            ['cop', 'advmod', 'ROOT', 'det', 'nsubj', 'obj', 'aux', 'acl:relcl'],
-            ['AUX', 'ADV', 'ADJ', 'DET', 'NOUN', 'PRON', 'AUX', 'VERB'],
-            [(3,5), (5,6)]
-        ),
-        # relative subclause
-        # il computer che hai comprato -> il computer, che   the computer that you bought
-        (
-            ['il', 'computer', 'che', 'hai', 'comprato'],
-            [1, 1, 4, 4, 1],
-            ['det', 'ROOT', 'nsubj', 'aux', 'acl:relcl'],
-            ['DET', 'NOUN', 'PRON', 'AUX', 'VERB'],
-            [(0,2), (2,3)]
-        ),
-        # det + noun + adj
-        # Una macchina grande -> Una macchina grande
-        (
-            ["Una", "macchina", "grande"],
-            [1, 1, 1],
-            ["det", "ROOT", "amod"],
-            ["DET", "NOUN", "ADJ"],
-            [(0,3)],
-        ),
-        # noun + adj plural
-        # mucche bianche
-        (
-            ["mucche", "bianche"],
-            [0, 0],
-            ["ROOT", "amod"],
-            ["NOUN", "ADJ"],
-            [(0,2)],
-        ),
-        # det + adj + noun
-        # Una grande macchina -> Una grande macchina
-        (
-            ['Una', 'grande', 'macchina'],
-            [2, 2, 2],
-            ["det", "amod", "ROOT"],
-            ["DET", "ADJ", "NOUN"],
-            [(0,3)]
-        ),
-        # det + adj + noun, det with apostrophe
-        # un'importante associazione -> un'importante associazione
-        (
-            ["Un'", 'importante', 'associazione'],
-            [2, 2, 2],
-            ["det", "amod", "ROOT"],
-            ["DET", "ADJ", "NOUN"],
-            [(0,3)]
-        ),
-        # multiple adjectives
-        # Un cane piccolo e marrone -> Un cane piccolo e marrone
-        (
-            ["Un", "cane", "piccolo", "e", "marrone"],
-            [1, 1, 1, 4, 2],
-            ["det", "ROOT", "amod", "cc", "conj"],
-            ["DET", "NOUN", "ADJ", "CCONJ", "ADJ"],
-            [(0,5)]
-        ),
-        # determiner, adjective, compound created by flat
-        # le Nazioni Unite -> le Nazioni Unite
-        (
-            ["le", "Nazioni", "Unite"],
-            [1, 1, 1],
-            ["det", "ROOT", "flat:name"],
-            ["DET", "PROPN", "PROPN"],
-            [(0,3)]
-        ),
-        # one determiner + one noun + one adjective qualified by an adverb
-        # alcuni contadini molto ricchi -> alcuni contadini molto ricchi   some very rich farmers
-        (
-            ['alcuni', 'contadini', 'molto', 'ricchi'],
-            [1, 1, 3, 1],
-            ['det', 'ROOT', 'advmod', 'amod'],
-            ['DET', 'NOUN', 'ADV', 'ADJ'],
-            [(0,4)]
-        ),
-        # Two NPs conjuncted
-        # Ho un cane e un gatto -> un cane, un gatto
-        (
-            ['Ho', 'un', 'cane', 'e', 'un', 'gatto'],
-            [0, 2, 0, 5, 5, 0],
-            ['ROOT', 'det', 'obj', 'cc', 'det', 'conj'],
-            ['VERB', 'DET', 'NOUN', 'CCONJ', 'DET', 'NOUN'],
-            [(1,3), (4,6)]
-        ),
-        # Two NPs together
-        # lo scrittore brasiliano Aníbal Machado -> lo scrittore brasiliano, Aníbal Machado
-        (
-            ['lo', 'scrittore', 'brasiliano', 'Aníbal', 'Machado'],
-            [1, 1, 1, 1, 3],
-            ['det', 'ROOT', 'amod', 'nmod', 'flat:name'],
-            ['DET', 'NOUN', 'ADJ', 'PROPN', 'PROPN'],
-            [(0, 3), (3, 5)]
-        ),
-        # Noun compound, person name and titles
-        # Dom Pedro II -> Dom Pedro II
-        (
-            ["Dom", "Pedro", "II"],
-            [0, 0, 0],
-            ["ROOT", "flat:name", "flat:name"],
-            ["PROPN", "PROPN", "PROPN"],
-            [(0,3)]
-        ),
-        # Noun compound created by flat
-        # gli Stati Uniti
-        (
-            ["gli", "Stati", "Uniti"],
-            [1, 1, 1],
-            ["det", "ROOT", "flat:name"],
-            ["DET", "PROPN", "PROPN"],
-            [(0,3)]
-        ),
-        # nmod relation between NPs
-        # la distruzione della città -> la distruzione, città
-        (
-            ['la', 'distruzione', 'della', 'città'],
-            [1, 1, 3, 1],
-            ['det', 'ROOT', 'case', 'nmod'],
-            ['DET', 'NOUN', 'ADP', 'NOUN'],
-            [(0,2), (3,4)]
-        ),
-        # Compounding by nmod, several NPs chained together
-        # la prima fabbrica di droga del governo -> la prima fabbrica, droga, governo
-        (
-            ["la", "prima", "fabbrica", "di", "droga", "del", "governo"],
-            [2, 2, 2, 4, 2, 6, 2],
-            ['det', 'amod', 'ROOT', 'case', 'nmod', 'case', 'nmod'],
-            ['DET', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN'],
-            [(0, 3), (4, 5), (6, 7)]
-        ),
-        # several NPs
-        # Traduzione del rapporto di Susana -> Traduzione, rapporto, Susana
-        (
-            ['Traduzione', 'del', 'rapporto', 'di', 'Susana'],
-            [0, 2, 0, 4, 2],
-            ['ROOT', 'case', 'nmod', 'case', 'nmod'],
-            ['NOUN', 'ADP', 'NOUN', 'ADP', 'PROPN'],
-            [(0,1), (2,3), (4,5)]
-        ),
-        # Several NPs
-        # Il gatto grasso di Susana e la sua amica -> Il gatto grasso, Susana, sua amica
-        (
-            ['Il', 'gatto', 'grasso', 'di', 'Susana', 'e', 'la', 'sua', 'amica'],
-            [1, 1, 1, 4, 1, 8, 8, 8, 1],
-            ['det', 'ROOT', 'amod', 'case', 'nmod', 'cc', 'det', 'det:poss', 'conj'],
-            ['DET', 'NOUN', 'ADJ', 'ADP', 'PROPN', 'CCONJ', 'DET', 'DET', 'NOUN'],
-            [(0,3), (4,5), (6,9)]
-        ),
-        # Passive subject
-        # La nuova spesa è alimentata dal grande conto in banca di Clinton -> La nuova spesa, grande conto, banca, Clinton
-        (
-            ['La', 'nuova', 'spesa', 'è', 'alimentata', 'dal', 'grande', 'conto', 'in', 'banca', 'di', 'Clinton'],
-            [2, 2, 4, 4, 4, 7, 7, 4, 9, 7, 11, 9],
-            ['det', 'amod', 'nsubj:pass', 'aux:pass', 'ROOT', 'case', 'amod', 'obl:agent', 'case', 'nmod', 'case', 'nmod'],
-            ['DET', 'ADJ', 'NOUN', 'AUX', 'VERB', 'ADP', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'ADP', 'PROPN'],
-            [(0, 3), (6, 8), (9, 10), (11,12)]
-        ),
-        # Misc
-        # Ma mentre questo prestito possa ora sembrare gestibile, un improvviso cambiamento delle circostanze potrebbe portare a problemi di debiti -> questo prestito, un improvviso cambiamento, circostanze, problemi, debiti
-        (
-            ['Ma', 'mentre', 'questo', 'prestito', 'possa', 'ora', 'sembrare', 'gestibile', ',', 'un', 'improvviso', 'cambiamento', 'delle', 'circostanze', 'potrebbe', 'portare', 'a', 'problemi', 'di', 'debitii'],
-            [15, 6, 3, 6, 6, 6, 15, 6, 6, 11, 11, 15, 13, 11, 15, 15, 17, 15, 19, 17],
-            ['cc', 'mark', 'det', 'nsubj', 'aux', 'advmod', 'advcl', 'xcomp', 'punct', 'det', 'amod', 'nsubj', 'case', 'nmod', 'aux', 'ROOT', 'case', 'obl', 'case', 'nmod'],
-            ['CCONJ', 'SCONJ', 'DET', 'NOUN', 'AUX', 'ADV', 'VERB', 'ADJ', 'PUNCT', 'DET', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'AUX', 'VERB', 'ADP', 'NOUN', 'ADP', 'NOUN'],
-            [(2,4), (9,12), (13,14), (17,18), (19,20)]
-        )
-    ],
-)
-# fmt: on
-def test_it_noun_chunks(it_vocab, words, heads, deps, pos, chunk_offsets):
-    doc = Doc(it_vocab, words=words, heads=heads, deps=deps, pos=pos)
-    assert [(c.start, c.end) for c in doc.noun_chunks] == chunk_offsets
-
-
-def test_noun_chunks_is_parsed_it(it_tokenizer):
-    """Test that noun_chunks raises Value Error for 'it' language if Doc is not parsed."""
-    doc = it_tokenizer("Sei andato a Oxford")
-    with pytest.raises(ValueError):
-        list(doc.noun_chunks)