Bump sudachipy version (#9917)

* Edited Slovenian stop words list (#9707)

* Noun chunks for Italian (#9662)

* added it vocab

* copied portuguese

* added possessive determiner

* added conjed Nps

* added nmoded Nps

* test misc

* more examples

* fixed typo

* fixed parenth

* fixed comma

* comma fix

* added syntax iters

* fix some index problems

* fixed index

* corrected heads for test case

* fixed test case

* fixed determiner gender

* cleaned left over

* added example with apostrophe

* French NP review (#9667)

* adapted from pt

* added basic tests

* added fr vocab

* fixed noun chunks

* more examples

* typo fix

* changed naming

* changed the naming

* typo fix

* Add Japanese kana characters to default exceptions (fix #9693) (#9742)

This includes the main kana, or phonetic characters, used in Japanese.

There are some supplemental kana blocks in Unicode outside the BMP that
could also be included. Because their actual use is rare I omitted them
for now, though they may be worth adding later. The omitted blocks are:

- Kana Supplement
- Kana Extended (A and B)
- Small Kana Extension

* Remove NER words from stop words in Norwegian (#9820)

Default stop words in Norwegian bokmål (nb) in spaCy contain important entities, e.g. France, Germany, Russia, Sweden and USA, police district, important units of time, e.g. months and days of the week, and organisations.

Nobody expects their presence among the default stop words. There is a danger of users complying with the general recommendation of filtering out stop words, while being unaware of filtering out important entities from their data.

See explanation in https://github.com/explosion/spaCy/issues/3052#issuecomment-986756711 and comment https://github.com/explosion/spaCy/issues/3052#issuecomment-986951831

* Bump sudachipy version

* Update sudachipy versions

* Bump versions

Bumping to the most recent dictionary just to keep things current.
Bumping sudachipy to 5.2 because older versions don't support recent
dictionaries.

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: Richard Hudson <richard@explosion.ai>
Co-authored-by: Duygu Altinok <duygu@explosion.ai>
Co-authored-by: Haakon Meland Eriksen <haakon.eriksen@far.no>
This commit is contained in:
Paul O'Leary McCann 2022-01-17 16:16:22 +09:00 committed by GitHub
parent a784b12eff
commit 58bdd8607b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 624 additions and 162 deletions

View File

@ -108,8 +108,8 @@ apple =
thinc-apple-ops>=0.0.4,<1.0.0
# Language tokenizers with external dependencies
ja =
sudachipy>=0.4.9
sudachidict_core>=20200330
sudachipy>=0.5.2,!=0.6.1
sudachidict_core>=20211220
ko =
natto-py==0.9.0
th =

View File

@ -45,6 +45,10 @@ _hangul_syllables = r"\uAC00-\uD7AF"
_hangul_jamo = r"\u1100-\u11FF"
_hangul = _hangul_syllables + _hangul_jamo
_hiragana = r"\u3040-\u309F"
_katakana = r"\u30A0-\u30FFー"
_kana = _hiragana + _katakana
# letters with diacritics - Catalan, Czech, Latin, Latvian, Lithuanian, Polish, Slovak, Turkish, Welsh
_latin_u_extendedA = (
r"\u0100\u0102\u0104\u0106\u0108\u010A\u010C\u010E\u0110\u0112\u0114\u0116\u0118\u011A\u011C"
@ -244,6 +248,7 @@ _uncased = (
+ _tamil
+ _telugu
+ _hangul
+ _kana
+ _cjk
)

View File

@ -6,16 +6,35 @@ from ...tokens import Doc, Span
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
"""Detect base noun phrases from a dependency parse. Works on Doc and Span."""
# fmt: off
labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
# fmt: on
"""
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
"""
labels = [
"nsubj",
"nsubj:pass",
"obj",
"obl",
"obl:agent",
"obl:arg",
"obl:mod",
"nmod",
"pcomp",
"appos",
"ROOT",
]
post_modifiers = ["flat", "flat:name", "flat:foreign", "fixed", "compound"]
doc = doclike.doc # Ensure works on both Doc and Span.
if not doc.has_annotation("DEP"):
raise ValueError(Errors.E029)
np_deps = [doc.vocab.strings[label] for label in labels]
conj = doc.vocab.strings.add("conj")
np_deps = {doc.vocab.strings.add(label) for label in labels}
np_modifs = {doc.vocab.strings.add(modifier) for modifier in post_modifiers}
np_label = doc.vocab.strings.add("NP")
adj_label = doc.vocab.strings.add("amod")
det_label = doc.vocab.strings.add("det")
det_pos = doc.vocab.strings.add("DET")
adp_pos = doc.vocab.strings.add("ADP")
conj_label = doc.vocab.strings.add("conj")
conj_pos = doc.vocab.strings.add("CCONJ")
prev_end = -1
for i, word in enumerate(doclike):
if word.pos not in (NOUN, PROPN, PRON):
@ -24,16 +43,45 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
if word.left_edge.i <= prev_end:
continue
if word.dep in np_deps:
prev_end = word.right_edge.i
yield word.left_edge.i, word.right_edge.i + 1, np_label
elif word.dep == conj:
right_childs = list(word.rights)
right_child = right_childs[0] if right_childs else None
if right_child:
if (
right_child.dep == adj_label
): # allow chain of adjectives by expanding to right
right_end = right_child.right_edge
elif (
right_child.dep == det_label and right_child.pos == det_pos
): # cut relative pronouns here
right_end = right_child
elif right_child.dep in np_modifs: # Check if we can expand to right
right_end = word.right_edge
else:
right_end = word
else:
right_end = word
prev_end = right_end.i
left_index = word.left_edge.i
left_index = (
left_index + 1 if word.left_edge.pos == adp_pos else left_index
)
yield left_index, right_end.i + 1, np_label
elif word.dep == conj_label:
head = word.head
while head.dep == conj and head.head.i < head.i:
while head.dep == conj_label and head.head.i < head.i:
head = head.head
# If the head is an NP, and we're coordinated to it, we're an NP
if head.dep in np_deps:
prev_end = word.right_edge.i
yield word.left_edge.i, word.right_edge.i + 1, np_label
prev_end = word.i
left_index = word.left_edge.i # eliminate left attached conjunction
left_index = (
left_index + 1 if word.left_edge.pos == conj_pos else left_index
)
yield left_index, word.i + 1, np_label
SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}

View File

@ -6,13 +6,15 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
from ...language import Language, BaseDefaults
from .lemmatizer import ItalianLemmatizer
from .syntax_iterators import SYNTAX_ITERATORS
class ItalianDefaults(BaseDefaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
stop_words = STOP_WORDS
prefixes = TOKENIZER_PREFIXES
infixes = TOKENIZER_INFIXES
stop_words = STOP_WORDS
syntax_iterators = SYNTAX_ITERATORS
class Italian(Language):

View File

@ -0,0 +1,86 @@
from typing import Union, Iterator, Tuple
from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors
from ...tokens import Doc, Span
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
    """
    Detect base noun phrases from a dependency parse. Works on both Doc and Span.

    doclike (Union[Doc, Span]): The parsed Doc or Span to scan.
    YIELDS (Tuple[int, int, int]): (start, end, label) triples; start and end
        are token indices (end is exclusive) and label is the string-store ID
        of "NP".
    RAISES (ValueError): Errors.E029 if the underlying Doc has no "DEP"
        annotation.
    """
    labels = [
        "nsubj",
        "nsubj:pass",
        "obj",
        "obl",
        "obl:agent",
        "nmod",
        "pcomp",
        "appos",
        "ROOT",
    ]
    post_modifiers = ["flat", "flat:name", "fixed", "compound"]
    dets = ["det", "det:poss"]
    doc = doclike.doc  # Ensure works on both Doc and Span.
    if not doc.has_annotation("DEP"):
        raise ValueError(Errors.E029)
    # Resolve every label / POS name to its ID once, so the loop below only
    # compares integers.
    np_deps = {doc.vocab.strings.add(label) for label in labels}
    np_modifs = {doc.vocab.strings.add(modifier) for modifier in post_modifiers}
    np_label = doc.vocab.strings.add("NP")
    adj_label = doc.vocab.strings.add("amod")
    det_labels = {doc.vocab.strings.add(det) for det in dets}
    det_pos = doc.vocab.strings.add("DET")
    adp_pos = doc.vocab.strings.add("ADP")
    conj_label = doc.vocab.strings.add("conj")
    conj_pos = doc.vocab.strings.add("CCONJ")
    prev_end = -1
    for word in doclike:
        if word.pos not in (NOUN, PROPN, PRON):
            continue
        # Prevent nested chunks from being produced
        if word.left_edge.i <= prev_end:
            continue
        if word.dep in np_deps:
            # Only the first right-hand child decides how far the chunk
            # extends to the right of the head noun.
            right_child = next(iter(word.rights), None)
            # Compare against None explicitly: truth-testing a Token would
            # go through its __len__ instead of checking for presence.
            if right_child is None:
                right_end = word
            elif right_child.dep == adj_label:
                # allow chain of adjectives by expanding to right
                right_end = right_child.right_edge
            elif right_child.dep in det_labels and right_child.pos == det_pos:
                # post-nominal determiner (e.g. "il cane mio"): include it
                # and stop the chunk there
                right_end = right_child
            elif right_child.dep in np_modifs:
                # flat / fixed / compound modifier: expand to the right edge
                right_end = word.right_edge
            else:
                right_end = word
            prev_end = right_end.i
            left_index = word.left_edge.i
            # drop a leading adposition (e.g. "della città" -> "città")
            if word.left_edge.pos == adp_pos:
                left_index += 1
            yield left_index, right_end.i + 1, np_label
        elif word.dep == conj_label:
            # Walk up the coordination chain to the first conjunct.
            head = word.head
            while head.dep == conj_label and head.head.i < head.i:
                head = head.head
            # If the head is an NP, and we're coordinated to it, we're an NP
            if head.dep in np_deps:
                prev_end = word.i
                left_index = word.left_edge.i
                # eliminate left attached conjunction
                if word.left_edge.pos == conj_pos:
                    left_index += 1
                yield left_index, word.i + 1, np_label
# Syntax iterators defined in this module, keyed by name.
SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}

View File

@ -4,46 +4,42 @@ alle allerede alt and andre annen annet at av
bak bare bedre beste blant ble bli blir blitt bris by både
da dag de del dem den denne der dermed det dette disse drept du
da dag de del dem den denne der dermed det dette disse du
eller en enn er et ett etter
fem fikk fire fjor flere folk for fortsatt fotball fra fram frankrike fredag
fem fikk fire fjor flere folk for fortsatt fra fram
funnet får fått før først første
gang gi gikk gjennom gjorde gjort gjør gjøre god godt grunn går
ha hadde ham han hans har hele helt henne hennes her hun hva hvor hvordan
hvorfor
ha hadde ham han hans har hele helt henne hennes her hun
i ifølge igjen ikke ingen inn
ja jeg
kamp kampen kan kl klart kom komme kommer kontakt kort kroner kunne kveld
kvinner
la laget land landet langt leder ligger like litt løpet lørdag
la laget land landet langt leder ligger like litt løpet
man mandag mange mannen mars med meg mellom men mener menn mennesker mens mer
millioner minutter mot msci mye mål måtte
man mange med meg mellom men mener mennesker mens mer mot mye mål måtte
ned neste noe noen nok norge norsk norske ntb ny nye når
ned neste noe noen nok ny nye når
og også om onsdag opp opplyser oslo oss over
og også om opp opplyser oss over
personer plass poeng politidistrikt politiet president prosent på
personer plass poeng på
regjeringen runde rundt russland
runde rundt
sa saken samme sammen samtidig satt se seg seks selv senere september ser sett
sa saken samme sammen samtidig satt se seg seks selv senere ser sett
siden sier sin sine siste sitt skal skriver skulle slik som sted stedet stor
store står sverige svært søndag
store står svært
ta tatt tid tidligere til tilbake tillegg tirsdag to tok torsdag tre tror
tyskland
ta tatt tid tidligere til tilbake tillegg tok tror
under usa ut uten utenfor
under ut uten utenfor
vant var ved veldig vi videre viktig vil ville viser vår være vært

View File

@ -1,13 +1,10 @@
# Source: https://github.com/stopwords-iso/stopwords-sl
# TODO: probably needs to be tidied up the list seems to have month names in
# it, which shouldn't be considered stop words.
# Removed various words that are not normally considered stop words, such as months.
STOP_WORDS = set(
"""
a
ali
april
avgust
b
bi
bil
@ -19,7 +16,6 @@ biti
blizu
bo
bodo
bojo
bolj
bom
bomo
@ -37,16 +33,6 @@ da
daleč
dan
danes
datum
december
deset
deseta
deseti
deseto
devet
deveta
deveti
deveto
do
dober
dobra
@ -54,16 +40,7 @@ dobri
dobro
dokler
dol
dolg
dolga
dolgi
dovolj
drug
druga
drugi
drugo
dva
dve
e
eden
en
@ -74,7 +51,6 @@ enkrat
eno
etc.
f
februar
g
g.
ga
@ -93,16 +69,12 @@ iv
ix
iz
j
januar
jaz
je
ji
jih
jim
jo
julij
junij
jutri
k
kadarkoli
kaj
@ -123,41 +95,23 @@ kje
kjer
kjerkoli
ko
koder
koderkoli
koga
komu
kot
kratek
kratka
kratke
kratki
l
lahka
lahke
lahki
lahko
le
lep
lepa
lepe
lepi
lepo
leto
m
maj
majhen
majhna
majhni
malce
malo
manj
marec
me
med
medtem
mene
mesec
mi
midva
midve
@ -183,7 +137,6 @@ najmanj
naju
največ
nam
narobe
nas
nato
nazaj
@ -192,7 +145,6 @@ naša
naše
ne
nedavno
nedelja
nek
neka
nekaj
@ -236,7 +188,6 @@ njuna
njuno
no
nocoj
november
npr.
o
ob
@ -244,51 +195,23 @@ oba
obe
oboje
od
odprt
odprta
odprti
okoli
oktober
on
onadva
one
oni
onidve
osem
osma
osmi
osmo
oz.
p
pa
pet
peta
petek
peti
peto
po
pod
pogosto
poleg
poln
polna
polni
polno
ponavadi
ponedeljek
ponovno
potem
povsod
pozdravljen
pozdravljeni
prav
prava
prave
pravi
pravo
prazen
prazna
prazno
prbl.
precej
pred
@ -297,19 +220,10 @@ preko
pri
pribl.
približno
primer
pripravljen
pripravljena
pripravljeni
proti
prva
prvi
prvo
r
ravno
redko
res
reč
s
saj
sam
@ -321,29 +235,17 @@ se
sebe
sebi
sedaj
sedem
sedma
sedmi
sedmo
sem
september
seveda
si
sicer
skoraj
skozi
slab
smo
so
sobota
spet
sreda
srednja
srednji
sta
ste
stran
stvar
sva
t
ta
@ -358,10 +260,6 @@ te
tebe
tebi
tega
težak
težka
težki
težko
ti
tista
tiste
@ -371,11 +269,6 @@ tj.
tja
to
toda
torek
tretja
tretje
tretji
tri
tu
tudi
tukaj
@ -392,10 +285,6 @@ vaša
vaše
ve
vedno
velik
velika
veliki
veliko
vendar
ves
več
@ -403,10 +292,6 @@ vi
vidva
vii
viii
visok
visoka
visoke
visoki
vsa
vsaj
vsak
@ -420,34 +305,21 @@ vsega
vsi
vso
včasih
včeraj
x
z
za
zadaj
zadnji
zakaj
zaprta
zaprti
zaprto
zdaj
zelo
zunaj
č
če
često
četrta
četrtek
četrti
četrto
čez
čigav
š
šest
šesta
šesti
šesto
štiri
ž
že
""".split()

View File

@ -155,6 +155,11 @@ def fr_tokenizer():
return get_lang_class("fr")().tokenizer
@pytest.fixture(scope="session")
def fr_vocab():
    """Session-scoped fixture: the vocab of a new instance of the "fr" class."""
    french_cls = get_lang_class("fr")
    nlp = french_cls()
    return nlp.vocab
@pytest.fixture(scope="session")
def ga_tokenizer():
    """Session-scoped fixture: the tokenizer of a new instance of the "ga" class."""
    irish_cls = get_lang_class("ga")
    nlp = irish_cls()
    return nlp.tokenizer
@ -205,6 +210,11 @@ def it_tokenizer():
return get_lang_class("it")().tokenizer
@pytest.fixture(scope="session")
def it_vocab():
    """Session-scoped fixture: the vocab of a new instance of the "it" class."""
    italian_cls = get_lang_class("it")
    nlp = italian_cls()
    return nlp.vocab
@pytest.fixture(scope="session")
def ja_tokenizer():
pytest.importorskip("sudachipy")

View File

@ -1,8 +1,230 @@
from spacy.tokens import Doc
import pytest
# fmt: off
@pytest.mark.parametrize(
"words,heads,deps,pos,chunk_offsets",
[
# determiner + noun
# un nom -> un nom
(
["un", "nom"],
[1, 1],
["det", "ROOT"],
["DET", "NOUN"],
[(0, 2)],
),
# determiner + noun starting with vowel
# l'heure -> l'heure
(
["l'", "heure"],
[1, 1],
["det", "ROOT"],
["DET", "NOUN"],
[(0, 2)],
),
# determiner + plural noun
# les romans -> les romans
(
["les", "romans"],
[1, 1],
["det", "ROOT"],
["DET", "NOUN"],
[(0, 2)],
),
# det + adj + noun
# Le vieux Londres -> Le vieux Londres
(
['Les', 'vieux', 'Londres'],
[2, 2, 2],
["det", "amod", "ROOT"],
["DET", "ADJ", "NOUN"],
[(0,3)]
),
# det + noun + adj
# le nom propre -> le nom propre a proper noun
(
["le", "nom", "propre"],
[1, 1, 1],
["det", "ROOT", "amod"],
["DET", "NOUN", "ADJ"],
[(0, 3)],
),
# det + noun + adj plural
# Les chiens bruns -> les chiens bruns
(
["Les", "chiens", "bruns"],
[1, 1, 1],
["det", "ROOT", "amod"],
["DET", "NOUN", "ADJ"],
[(0, 3)],
),
# multiple adjectives: one adj before the noun, one adj after the noun
# un nouveau film intéressant -> un nouveau film intéressant
(
["un", "nouveau", "film", "intéressant"],
[2, 2, 2, 2],
["det", "amod", "ROOT", "amod"],
["DET", "ADJ", "NOUN", "ADJ"],
[(0,4)]
),
# multiple adjectives, both adjs after the noun
# une personne intelligente et drôle -> une personne intelligente et drôle
(
["une", "personne", "intelligente", "et", "drôle"],
[1, 1, 1, 4, 2],
["det", "ROOT", "amod", "cc", "conj"],
["DET", "NOUN", "ADJ", "CCONJ", "ADJ"],
[(0,5)]
),
# relative pronoun
# un bus qui va au ville -> un bus, qui, ville
(
['un', 'bus', 'qui', 'va', 'au', 'ville'],
[1, 1, 3, 1, 5, 3],
['det', 'ROOT', 'nsubj', 'acl:relcl', 'case', 'obl:arg'],
['DET', 'NOUN', 'PRON', 'VERB', 'ADP', 'NOUN'],
[(0,2), (2,3), (5,6)]
),
# relative subclause
# Voilà la maison que nous voulons acheter -> la maison, nous That's the house that we want to buy.
(
['Voilà', 'la', 'maison', 'que', 'nous', 'voulons', 'acheter'],
[0, 2, 0, 5, 5, 2, 5],
['ROOT', 'det', 'obj', 'mark', 'nsubj', 'acl:relcl', 'xcomp'],
['VERB', 'DET', 'NOUN', 'SCONJ', 'PRON', 'VERB', 'VERB'],
[(1,3), (4,5)]
),
# Person name and title by flat
# Louis XIV -> Louis XIV
(
["Louis", "XIV"],
[0, 0],
["ROOT", "flat:name"],
["PROPN", "PROPN"],
[(0,2)]
),
# Organization name by flat
# Nations Unies -> Nations Unies
(
["Nations", "Unies"],
[0, 0],
["ROOT", "flat:name"],
["PROPN", "PROPN"],
[(0,2)]
),
# Noun compound, person name created by two flats
# Louise de Bratagne -> Louise de Bratagne
(
["Louise", "de", "Bratagne"],
[0, 0, 0],
["ROOT", "flat:name", "flat:name"],
["PROPN", "PROPN", "PROPN"],
[(0,3)]
),
# Noun compound, person name created by two flats
# Louis François Joseph -> Louis François Joseph
(
["Louis", "François", "Joseph"],
[0, 0, 0],
["ROOT", "flat:name", "flat:name"],
["PROPN", "PROPN", "PROPN"],
[(0,3)]
),
# one determiner + one noun + one adjective qualified by an adverb
# quelques agriculteurs très riches -> quelques agriculteurs très riches
(
["quelques", "agriculteurs", "très", "riches"],
[1, 1, 3, 1],
['det', 'ROOT', 'advmod', 'amod'],
['DET', 'NOUN', 'ADV', 'ADJ'],
[(0,4)]
),
# Two NPs conjuncted
# Il a un chien et un chat -> Il, un chien, un chat
(
['Il', 'a', 'un', 'chien', 'et', 'un', 'chat'],
[1, 1, 3, 1, 6, 6, 3],
['nsubj', 'ROOT', 'det', 'obj', 'cc', 'det', 'conj'],
['PRON', 'VERB', 'DET', 'NOUN', 'CCONJ', 'DET', 'NOUN'],
[(0,1), (2,4), (5,7)]
),
# Two NPs together
# l'écrivain brésilien Aníbal Machado -> l'écrivain brésilien, Aníbal Machado
(
["l'", 'écrivain', 'brésilien', 'Aníbal', 'Machado'],
[1, 1, 1, 1, 3],
['det', 'ROOT', 'amod', 'appos', 'flat:name'],
['DET', 'NOUN', 'ADJ', 'PROPN', 'PROPN'],
[(0, 3), (3, 5)]
),
# nmod relation between NPs
# la destruction de la ville -> la destruction, la ville
(
['la', 'destruction', 'de', 'la', 'ville'],
[1, 1, 4, 4, 1],
['det', 'ROOT', 'case', 'det', 'nmod'],
['DET', 'NOUN', 'ADP', 'DET', 'NOUN'],
[(0,2), (3,5)]
),
# nmod relation between NPs
# Archiduchesse d'Autriche -> Archiduchesse, Autriche
(
['Archiduchesse', 'd', 'Autriche'],
[0, 2, 0],
['ROOT', 'case', 'nmod'],
['NOUN', 'ADP', 'PROPN'],
[(0,1), (2,3)]
),
# Compounding by nmod, several NPs chained together
# la première usine de drogue du gouvernement -> la première usine, drogue, gouvernement
(
["la", "première", "usine", "de", "drogue", "du", "gouvernement"],
[2, 2, 2, 4, 2, 6, 2],
['det', 'amod', 'ROOT', 'case', 'nmod', 'case', 'nmod'],
['DET', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN'],
[(0, 3), (4, 5), (6, 7)]
),
# several NPs
# Traduction du rapport de Susana -> Traduction, rapport, Susana
(
['Traduction', 'du', 'raport', 'de', 'Susana'],
[0, 2, 0, 4, 2],
['ROOT', 'case', 'nmod', 'case', 'nmod'],
['NOUN', 'ADP', 'NOUN', 'ADP', 'PROPN'],
[(0,1), (2,3), (4,5)]
),
# Several NPs
# Le gros chat de Susana et son amie -> Le gros chat, Susana, son amie
(
['Le', 'gros', 'chat', 'de', 'Susana', 'et', 'son', 'amie'],
[2, 2, 2, 4, 2, 7, 7, 2],
['det', 'amod', 'ROOT', 'case', 'nmod', 'cc', 'det', 'conj'],
['DET', 'ADJ', 'NOUN', 'ADP', 'PROPN', 'CCONJ', 'DET', 'NOUN'],
[(0,3), (4,5), (6,8)]
),
# Passive subject
# Les nouvelles dépenses sont alimentées par le grand compte bancaire de Clinton -> Les nouvelles dépenses, le grand compte bancaire, Clinton
(
['Les', 'nouvelles', 'dépenses', 'sont', 'alimentées', 'par', 'le', 'grand', 'compte', 'bancaire', 'de', 'Clinton'],
[2, 2, 4, 4, 4, 8, 8, 8, 4, 8, 11, 8],
['det', 'amod', 'nsubj:pass', 'aux:pass', 'ROOT', 'case', 'det', 'amod', 'obl:agent', 'amod', 'case', 'nmod'],
['DET', 'ADJ', 'NOUN', 'AUX', 'VERB', 'ADP', 'DET', 'ADJ', 'NOUN', 'ADJ', 'ADP', 'PROPN'],
[(0, 3), (6, 10), (11, 12)]
)
],
)
# fmt: on
def test_fr_noun_chunks(fr_vocab, words, heads, deps, pos, chunk_offsets):
    """Check that the noun chunks of a hand-annotated French Doc match the expected offsets."""
    annotated = Doc(fr_vocab, words=words, heads=heads, deps=deps, pos=pos)
    found_offsets = []
    for chunk in annotated.noun_chunks:
        found_offsets.append((chunk.start, chunk.end))
    assert found_offsets == chunk_offsets
def test_noun_chunks_is_parsed_fr(fr_tokenizer):
    """Test that noun_chunks raises Value Error for 'fr' language if Doc is not parsed."""
    # The Doc is only tokenized here; a single sample sentence is enough
    # (an earlier assignment to `doc` was overwritten before use).
    doc = fr_tokenizer("Je suis allé à l'école")
    with pytest.raises(ValueError):
        list(doc.noun_chunks)

View File

@ -0,0 +1,221 @@
from spacy.tokens import Doc
import pytest
# fmt: off
@pytest.mark.parametrize(
"words,heads,deps,pos,chunk_offsets",
[
# determiner + noun
# un pollo -> un pollo
(
["un", "pollo"],
[1, 1],
["det", "ROOT"],
["DET", "NOUN"],
[(0,2)],
),
# two determiners + noun
# il mio cane -> il mio cane
(
["il", "mio", "cane"],
[2, 2, 2],
["det", "det:poss", "ROOT"],
["DET", "DET", "NOUN"],
[(0,3)],
),
# two determiners, one is after noun. rare usage but still testing
# il cane mio-> il cane mio
(
["il", "cane", "mio"],
[1, 1, 1],
["det", "ROOT", "det:poss"],
["DET", "NOUN", "DET"],
[(0,3)],
),
# relative pronoun
# È molto bello il vestito che hai acquistato -> il vestito, che the dress that you bought is very pretty.
(
["È", "molto", "bello", "il", "vestito", "che", "hai", "acquistato"],
[2, 2, 2, 4, 2, 7, 7, 4],
['cop', 'advmod', 'ROOT', 'det', 'nsubj', 'obj', 'aux', 'acl:relcl'],
['AUX', 'ADV', 'ADJ', 'DET', 'NOUN', 'PRON', 'AUX', 'VERB'],
[(3,5), (5,6)]
),
# relative subclause
# il computer che hai comprato -> il computer, che the computer that you bought
(
['il', 'computer', 'che', 'hai', 'comprato'],
[1, 1, 4, 4, 1],
['det', 'ROOT', 'nsubj', 'aux', 'acl:relcl'],
['DET', 'NOUN', 'PRON', 'AUX', 'VERB'],
[(0,2), (2,3)]
),
# det + noun + adj
# Una macchina grande -> Una macchina grande
(
["Una", "macchina", "grande"],
[1, 1, 1],
["det", "ROOT", "amod"],
["DET", "NOUN", "ADJ"],
[(0,3)],
),
# noun + adj plural
# mucche bianche
(
["mucche", "bianche"],
[0, 0],
["ROOT", "amod"],
["NOUN", "ADJ"],
[(0,2)],
),
# det + adj + noun
# Una grande macchina -> Una grande macchina
(
['Una', 'grande', 'macchina'],
[2, 2, 2],
["det", "amod", "ROOT"],
["DET", "ADJ", "NOUN"],
[(0,3)]
),
# det + adj + noun, det with apostrophe
# un'importante associazione -> un'importante associazione
(
["Un'", 'importante', 'associazione'],
[2, 2, 2],
["det", "amod", "ROOT"],
["DET", "ADJ", "NOUN"],
[(0,3)]
),
# multiple adjectives
# Un cane piccolo e marrone -> Un cane piccolo e marrone
(
["Un", "cane", "piccolo", "e", "marrone"],
[1, 1, 1, 4, 2],
["det", "ROOT", "amod", "cc", "conj"],
["DET", "NOUN", "ADJ", "CCONJ", "ADJ"],
[(0,5)]
),
# determiner, adjective, compound created by flat
# le Nazioni Unite -> le Nazioni Unite
(
["le", "Nazioni", "Unite"],
[1, 1, 1],
["det", "ROOT", "flat:name"],
["DET", "PROPN", "PROPN"],
[(0,3)]
),
# one determiner + one noun + one adjective qualified by an adverb
# alcuni contadini molto ricchi -> alcuni contadini molto ricchi some very rich farmers
(
['alcuni', 'contadini', 'molto', 'ricchi'],
[1, 1, 3, 1],
['det', 'ROOT', 'advmod', 'amod'],
['DET', 'NOUN', 'ADV', 'ADJ'],
[(0,4)]
),
# Two NPs conjuncted
# Ho un cane e un gatto -> un cane, un gatto
(
['Ho', 'un', 'cane', 'e', 'un', 'gatto'],
[0, 2, 0, 5, 5, 0],
['ROOT', 'det', 'obj', 'cc', 'det', 'conj'],
['VERB', 'DET', 'NOUN', 'CCONJ', 'DET', 'NOUN'],
[(1,3), (4,6)]
),
# Two NPs together
# lo scrittore brasiliano Aníbal Machado -> lo scrittore brasiliano, Aníbal Machado
(
['lo', 'scrittore', 'brasiliano', 'Aníbal', 'Machado'],
[1, 1, 1, 1, 3],
['det', 'ROOT', 'amod', 'nmod', 'flat:name'],
['DET', 'NOUN', 'ADJ', 'PROPN', 'PROPN'],
[(0, 3), (3, 5)]
),
# Noun compound, person name and titles
# Dom Pedro II -> Dom Pedro II
(
["Dom", "Pedro", "II"],
[0, 0, 0],
["ROOT", "flat:name", "flat:name"],
["PROPN", "PROPN", "PROPN"],
[(0,3)]
),
# Noun compound created by flat
# gli Stati Uniti
(
["gli", "Stati", "Uniti"],
[1, 1, 1],
["det", "ROOT", "flat:name"],
["DET", "PROPN", "PROPN"],
[(0,3)]
),
# nmod relation between NPs
# la distruzione della città -> la distruzione, città
(
['la', 'distruzione', 'della', 'città'],
[1, 1, 3, 1],
['det', 'ROOT', 'case', 'nmod'],
['DET', 'NOUN', 'ADP', 'NOUN'],
[(0,2), (3,4)]
),
# Compounding by nmod, several NPs chained together
# la prima fabbrica di droga del governo -> la prima fabbrica, droga, governo
(
["la", "prima", "fabbrica", "di", "droga", "del", "governo"],
[2, 2, 2, 4, 2, 6, 2],
['det', 'amod', 'ROOT', 'case', 'nmod', 'case', 'nmod'],
['DET', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN'],
[(0, 3), (4, 5), (6, 7)]
),
# several NPs
# Traduzione del rapporto di Susana -> Traduzione, rapporto, Susana
(
['Traduzione', 'del', 'rapporto', 'di', 'Susana'],
[0, 2, 0, 4, 2],
['ROOT', 'case', 'nmod', 'case', 'nmod'],
['NOUN', 'ADP', 'NOUN', 'ADP', 'PROPN'],
[(0,1), (2,3), (4,5)]
),
# Several NPs
# Il gatto grasso di Susana e la sua amica -> Il gatto grasso, Susana, sua amica
(
['Il', 'gatto', 'grasso', 'di', 'Susana', 'e', 'la', 'sua', 'amica'],
[1, 1, 1, 4, 1, 8, 8, 8, 1],
['det', 'ROOT', 'amod', 'case', 'nmod', 'cc', 'det', 'det:poss', 'conj'],
['DET', 'NOUN', 'ADJ', 'ADP', 'PROPN', 'CCONJ', 'DET', 'DET', 'NOUN'],
[(0,3), (4,5), (6,9)]
),
# Passive subject
# La nuova spesa è alimentata dal grande conto in banca di Clinton -> La nuova spesa, grande conto, banca, Clinton
(
['La', 'nuova', 'spesa', 'è', 'alimentata', 'dal', 'grande', 'conto', 'in', 'banca', 'di', 'Clinton'],
[2, 2, 4, 4, 4, 7, 7, 4, 9, 7, 11, 9],
['det', 'amod', 'nsubj:pass', 'aux:pass', 'ROOT', 'case', 'amod', 'obl:agent', 'case', 'nmod', 'case', 'nmod'],
['DET', 'ADJ', 'NOUN', 'AUX', 'VERB', 'ADP', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'ADP', 'PROPN'],
[(0, 3), (6, 8), (9, 10), (11,12)]
),
# Misc
# Ma mentre questo prestito possa ora sembrare gestibile, un improvviso cambiamento delle circostanze potrebbe portare a problemi di debiti -> questo prestito, un improvviso cambiamento, circostanze, problemi, debiti
(
['Ma', 'mentre', 'questo', 'prestito', 'possa', 'ora', 'sembrare', 'gestibile', ',', 'un', 'improvviso', 'cambiamento', 'delle', 'circostanze', 'potrebbe', 'portare', 'a', 'problemi', 'di', 'debitii'],
[15, 6, 3, 6, 6, 6, 15, 6, 6, 11, 11, 15, 13, 11, 15, 15, 17, 15, 19, 17],
['cc', 'mark', 'det', 'nsubj', 'aux', 'advmod', 'advcl', 'xcomp', 'punct', 'det', 'amod', 'nsubj', 'case', 'nmod', 'aux', 'ROOT', 'case', 'obl', 'case', 'nmod'],
['CCONJ', 'SCONJ', 'DET', 'NOUN', 'AUX', 'ADV', 'VERB', 'ADJ', 'PUNCT', 'DET', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'AUX', 'VERB', 'ADP', 'NOUN', 'ADP', 'NOUN'],
[(2,4), (9,12), (13,14), (17,18), (19,20)]
)
],
)
# fmt: on
def test_it_noun_chunks(it_vocab, words, heads, deps, pos, chunk_offsets):
    """Check that the noun chunks of a hand-annotated Italian Doc match the expected offsets."""
    annotated = Doc(it_vocab, words=words, heads=heads, deps=deps, pos=pos)
    found_offsets = []
    for chunk in annotated.noun_chunks:
        found_offsets.append((chunk.start, chunk.end))
    assert found_offsets == chunk_offsets
def test_noun_chunks_is_parsed_it(it_tokenizer):
    """Requesting noun_chunks on a tokenized-only 'it' Doc must raise ValueError."""
    unparsed = it_tokenizer("Sei andato a Oxford")
    chunk_iter = pytest.raises(ValueError)
    with chunk_iter:
        # Force the iterator to run so the error is actually triggered.
        list(unparsed.noun_chunks)