Bump sudachipy version (#9917)

* Edited Slovenian stop words list (#9707)

* Noun chunks for Italian (#9662)

* added it vocab

* copied portuguese

* added possessive determiner

* added conj'ed NPs

* added nmod'ed NPs

* test misc

* more examples

* fixed typo

* fixed parenth

* fixed comma

* comma fix

* added syntax iters

* fix some index problems

* fixed index

* corrected heads for test case

* fixed test case

* fixed determiner gender

* cleaned left over

* added example with apostrophe

* French NP review (#9667)

* adapted from pt

* added basic tests

* added fr vocab

* fixed noun chunks

* more examples

* typo fix

* changed naming

* changed the naming

* typo fix

* Add Japanese kana characters to default exceptions (fix #9693) (#9742)

This includes the main kana, or phonetic characters, used in Japanese.

There are some supplemental kana blocks in Unicode outside the BMP that
could also be included, but because they are rarely used in practice I
have omitted them for now; they may be worth adding later. The omitted
blocks are:

- Kana Supplement
- Kana Extended (A and B)
- Small Kana Extension
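
As a quick sanity check of the two BMP ranges this change adds (a sketch using only the standard library; the range strings match the char_classes diff below):

    import re

    # The two ranges added to spaCy's char_classes in this commit.
    _hiragana = r"\u3040-\u309F"
    _katakana = r"\u30A0-\u30FFー"
    kana_re = re.compile(f"[{_hiragana}{_katakana}]+")

    assert kana_re.fullmatch("ひらがな")    # hiragana
    assert kana_re.fullmatch("カタカナー")  # katakana plus the long-vowel mark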

* Remove NER words from stop words in Norwegian (#9820)

The default stop words for Norwegian Bokmål (nb) in spaCy contain important entities, e.g. France, Germany, Russia, Sweden and USA, as well as "police district", important units of time such as months and days of the week, and organisations.

Nobody expects these among the default stop words, so there is a danger that users follow the general recommendation to filter out stop words and unknowingly filter important entities out of their data.

See the explanation in https://github.com/explosion/spaCy/issues/3052#issuecomment-986756711 and the follow-up comment in https://github.com/explosion/spaCy/issues/3052#issuecomment-986951831.
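
To make the failure mode concrete, a sketch (assumes a spaCy install; the comments contrast behaviour before and after this change):

    import spacy

    nlp = spacy.blank("nb")
    doc = nlp("Frankrike og Sverige")
    kept = [t.text for t in doc if not t.is_stop]
    # old stop list: [] - both country names silently vanish
    # new stop list: ['Frankrike', 'Sverige']
    print(kept)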

* Bump sudachipy version

* Update sudachipy versions

* Bump versions

Bumping to the most recent dictionary just to keep things current.
Bumping sudachipy to 0.5.2 because older versions don't support recent
dictionaries.
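
A quick way to check that an installed environment satisfies the new pins (a sketch; assumes the packages from the ja extra are installed):

    from importlib.metadata import version

    # Expected per the setup.cfg change below:
    # sudachipy >= 0.5.2 and != 0.6.1, sudachidict_core >= 20211220
    print(version("sudachipy"))
    print(version("sudachidict_core"))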

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: Richard Hudson <richard@explosion.ai>
Co-authored-by: Duygu Altinok <duygu@explosion.ai>
Co-authored-by: Haakon Meland Eriksen <haakon.eriksen@far.no>
Paul O'Leary McCann, 2022-01-17 16:16:22 +09:00, committed by GitHub
parent a784b12eff
commit 58bdd8607b
10 changed files with 624 additions and 162 deletions

setup.cfg

@@ -108,8 +108,8 @@ apple =
     thinc-apple-ops>=0.0.4,<1.0.0
 # Language tokenizers with external dependencies
 ja =
-    sudachipy>=0.4.9
-    sudachidict_core>=20200330
+    sudachipy>=0.5.2,!=0.6.1
+    sudachidict_core>=20211220
 ko =
     natto-py==0.9.0
 th =

spacy/lang/char_classes.py

@@ -45,6 +45,10 @@ _hangul_syllables = r"\uAC00-\uD7AF"
 _hangul_jamo = r"\u1100-\u11FF"
 _hangul = _hangul_syllables + _hangul_jamo

+_hiragana = r"\u3040-\u309F"
+_katakana = r"\u30A0-\u30FFー"
+_kana = _hiragana + _katakana
+
 # letters with diacritics - Catalan, Czech, Latin, Latvian, Lithuanian, Polish, Slovak, Turkish, Welsh
 _latin_u_extendedA = (
     r"\u0100\u0102\u0104\u0106\u0108\u010A\u010C\u010E\u0110\u0112\u0114\u0116\u0118\u011A\u011C"
@@ -244,6 +248,7 @@ _uncased = (
     + _tamil
     + _telugu
     + _hangul
+    + _kana
     + _cjk
 )
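
The practical effect is that kana now count as letters wherever spaCy uses its uncased alphabet (a sketch; assumes a spaCy build that includes this change):

    import re
    from spacy.lang.char_classes import ALPHA

    # ALPHA is a character-class fragment; with _kana folded into
    # _uncased, both of these now match.
    assert re.match(f"[{ALPHA}]", "ひ")
    assert re.match(f"[{ALPHA}]", "カ")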

spacy/lang/fr/syntax_iterators.py

@@ -6,16 +6,35 @@ from ...tokens import Doc, Span


 def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
-    """Detect base noun phrases from a dependency parse. Works on Doc and Span."""
-    # fmt: off
-    labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
-    # fmt: on
+    """
+    Detect base noun phrases from a dependency parse. Works on both Doc and Span.
+    """
+    labels = [
+        "nsubj",
+        "nsubj:pass",
+        "obj",
+        "obl",
+        "obl:agent",
+        "obl:arg",
+        "obl:mod",
+        "nmod",
+        "pcomp",
+        "appos",
+        "ROOT",
+    ]
+    post_modifiers = ["flat", "flat:name", "flat:foreign", "fixed", "compound"]
     doc = doclike.doc  # Ensure works on both Doc and Span.
     if not doc.has_annotation("DEP"):
         raise ValueError(Errors.E029)
-    np_deps = [doc.vocab.strings[label] for label in labels]
-    conj = doc.vocab.strings.add("conj")
+    np_deps = {doc.vocab.strings.add(label) for label in labels}
+    np_modifs = {doc.vocab.strings.add(modifier) for modifier in post_modifiers}
     np_label = doc.vocab.strings.add("NP")
+    adj_label = doc.vocab.strings.add("amod")
+    det_label = doc.vocab.strings.add("det")
+    det_pos = doc.vocab.strings.add("DET")
+    adp_pos = doc.vocab.strings.add("ADP")
+    conj_label = doc.vocab.strings.add("conj")
+    conj_pos = doc.vocab.strings.add("CCONJ")
     prev_end = -1
     for i, word in enumerate(doclike):
         if word.pos not in (NOUN, PROPN, PRON):
@@ -24,16 +43,45 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
         if word.left_edge.i <= prev_end:
             continue
         if word.dep in np_deps:
-            prev_end = word.right_edge.i
-            yield word.left_edge.i, word.right_edge.i + 1, np_label
-        elif word.dep == conj:
+            right_childs = list(word.rights)
+            right_child = right_childs[0] if right_childs else None
+
+            if right_child:
+                if (
+                    right_child.dep == adj_label
+                ):  # allow chain of adjectives by expanding to right
+                    right_end = right_child.right_edge
+                elif (
+                    right_child.dep == det_label and right_child.pos == det_pos
+                ):  # cut relative pronouns here
+                    right_end = right_child
+                elif right_child.dep in np_modifs:  # Check if we can expand to right
+                    right_end = word.right_edge
+                else:
+                    right_end = word
+            else:
+                right_end = word
+            prev_end = right_end.i
+
+            left_index = word.left_edge.i
+            left_index = (
+                left_index + 1 if word.left_edge.pos == adp_pos else left_index
+            )
+
+            yield left_index, right_end.i + 1, np_label
+        elif word.dep == conj_label:
             head = word.head
-            while head.dep == conj and head.head.i < head.i:
+            while head.dep == conj_label and head.head.i < head.i:
                 head = head.head
             # If the head is an NP, and we're coordinated to it, we're an NP
             if head.dep in np_deps:
-                prev_end = word.right_edge.i
-                yield word.left_edge.i, word.right_edge.i + 1, np_label
+                prev_end = word.i
+
+                left_index = word.left_edge.i  # eliminate left attached conjunction
+                left_index = (
+                    left_index + 1 if word.left_edge.pos == conj_pos else left_index
+                )
+                yield left_index, word.i + 1, np_label


 SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
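
The new behaviour can be exercised without a trained pipeline by building the parse by hand, exactly as the tests added below do (a sketch):

    import spacy
    from spacy.tokens import Doc

    nlp = spacy.blank("fr")
    # "la destruction de la ville": the nmod chain is now split into
    # two chunks, and the leading preposition is cut from the second.
    doc = Doc(
        nlp.vocab,
        words=["la", "destruction", "de", "la", "ville"],
        heads=[1, 1, 4, 4, 1],
        deps=["det", "ROOT", "case", "det", "nmod"],
        pos=["DET", "NOUN", "ADP", "DET", "NOUN"],
    )
    assert [(c.start, c.end) for c in doc.noun_chunks] == [(0, 2), (3, 5)]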

spacy/lang/it/__init__.py

@@ -6,13 +6,15 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
 from ...language import Language, BaseDefaults
 from .lemmatizer import ItalianLemmatizer
+from .syntax_iterators import SYNTAX_ITERATORS


 class ItalianDefaults(BaseDefaults):
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
-    stop_words = STOP_WORDS
     prefixes = TOKENIZER_PREFIXES
     infixes = TOKENIZER_INFIXES
+    stop_words = STOP_WORDS
+    syntax_iterators = SYNTAX_ITERATORS


 class Italian(Language):

spacy/lang/it/syntax_iterators.py (new file)

@@ -0,0 +1,86 @@
from typing import Union, Iterator, Tuple
from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors
from ...tokens import Doc, Span


def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
"""
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
"""
labels = [
"nsubj",
"nsubj:pass",
"obj",
"obl",
"obl:agent",
"nmod",
"pcomp",
"appos",
"ROOT",
]
post_modifiers = ["flat", "flat:name", "fixed", "compound"]
dets = ["det", "det:poss"]
doc = doclike.doc # Ensure works on both Doc and Span.
if not doc.has_annotation("DEP"):
raise ValueError(Errors.E029)
np_deps = {doc.vocab.strings.add(label) for label in labels}
np_modifs = {doc.vocab.strings.add(modifier) for modifier in post_modifiers}
np_label = doc.vocab.strings.add("NP")
adj_label = doc.vocab.strings.add("amod")
det_labels = {doc.vocab.strings.add(det) for det in dets}
det_pos = doc.vocab.strings.add("DET")
adp_label = doc.vocab.strings.add("ADP")
conj = doc.vocab.strings.add("conj")
conj_pos = doc.vocab.strings.add("CCONJ")
prev_end = -1
for i, word in enumerate(doclike):
if word.pos not in (NOUN, PROPN, PRON):
continue
# Prevent nested chunks from being produced
if word.left_edge.i <= prev_end:
continue
if word.dep in np_deps:
right_childs = list(word.rights)
right_child = right_childs[0] if right_childs else None
if right_child:
if (
right_child.dep == adj_label
): # allow chain of adjectives by expanding to right
right_end = right_child.right_edge
elif (
right_child.dep in det_labels and right_child.pos == det_pos
): # cut relative pronouns here
right_end = right_child
elif right_child.dep in np_modifs: # Check if we can expand to right
right_end = word.right_edge
else:
right_end = word
else:
right_end = word
prev_end = right_end.i
left_index = word.left_edge.i
left_index = (
left_index + 1 if word.left_edge.pos == adp_label else left_index
)
yield left_index, right_end.i + 1, np_label
elif word.dep == conj:
head = word.head
while head.dep == conj and head.head.i < head.i:
head = head.head
# If the head is an NP, and we're coordinated to it, we're an NP
if head.dep in np_deps:
prev_end = word.i
left_index = word.left_edge.i # eliminate left attached conjunction
left_index = (
left_index + 1 if word.left_edge.pos == conj_pos else left_index
)
yield left_index, word.i + 1, np_label


SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
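
As with the French iterator, a hand-built parse exercises it without a trained model (a sketch mirroring one of the tests added below):

    import spacy
    from spacy.tokens import Doc

    nlp = spacy.blank("it")
    # "la distruzione della città" -> "la distruzione", "città"
    doc = Doc(
        nlp.vocab,
        words=["la", "distruzione", "della", "città"],
        heads=[1, 1, 3, 1],
        deps=["det", "ROOT", "case", "nmod"],
        pos=["DET", "NOUN", "ADP", "NOUN"],
    )
    assert [(c.start, c.end) for c in doc.noun_chunks] == [(0, 2), (3, 4)]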

spacy/lang/nb/stop_words.py

@@ -4,46 +4,42 @@ alle allerede alt and andre annen annet at av

 bak bare bedre beste blant ble bli blir blitt bris by både

-da dag de del dem den denne der dermed det dette disse drept du
+da dag de del dem den denne der dermed det dette disse du

 eller en enn er et ett etter

-fem fikk fire fjor flere folk for fortsatt fotball fra fram frankrike fredag
+fem fikk fire fjor flere folk for fortsatt fra fram
 funnet får fått før først første

 gang gi gikk gjennom gjorde gjort gjør gjøre god godt grunn går

-ha hadde ham han hans har hele helt henne hennes her hun hva hvor hvordan
-hvorfor
+ha hadde ham han hans har hele helt henne hennes her hun

 i ifølge igjen ikke ingen inn

 ja jeg

 kamp kampen kan kl klart kom komme kommer kontakt kort kroner kunne kveld
-kvinner

-la laget land landet langt leder ligger like litt løpet lørdag
+la laget land landet langt leder ligger like litt løpet

-man mandag mange mannen mars med meg mellom men mener menn mennesker mens mer
-millioner minutter mot msci mye mål måtte
+man mange med meg mellom men mener mennesker mens mer mot mye mål måtte

-ned neste noe noen nok norge norsk norske ntb ny nye når
+ned neste noe noen nok ny nye når

-og også om onsdag opp opplyser oslo oss over
+og også om opp opplyser oss over

-personer plass poeng politidistrikt politiet president prosent på
+personer plass poeng på

-regjeringen runde rundt russland
+runde rundt

-sa saken samme sammen samtidig satt se seg seks selv senere september ser sett
+sa saken samme sammen samtidig satt se seg seks selv senere ser sett
 siden sier sin sine siste sitt skal skriver skulle slik som sted stedet stor
-store står sverige svært søndag
+store står svært

-ta tatt tid tidligere til tilbake tillegg tirsdag to tok torsdag tre tror
-tyskland
+ta tatt tid tidligere til tilbake tillegg tok tror

-under usa ut uten utenfor
+under ut uten utenfor

 vant var ved veldig vi videre viktig vil ville viser vår være vært

spacy/lang/sl/stop_words.py

@@ -1,13 +1,10 @@
 # Source: https://github.com/stopwords-iso/stopwords-sl
-# TODO: probably needs to be tidied up the list seems to have month names in
-# it, which shouldn't be considered stop words.
+# Removed various words that are not normally considered stop words, such as months.
 STOP_WORDS = set(
     """
 a
 ali
-april
-avgust
 b
 bi
 bil
@@ -19,7 +16,6 @@ biti
 blizu
 bo
 bodo
-bojo
 bolj
 bom
 bomo
@@ -37,16 +33,6 @@ da
 daleč
 dan
 danes
-datum
-december
-deset
-deseta
-deseti
-deseto
-devet
-deveta
-deveti
-deveto
 do
 dober
 dobra
@@ -54,16 +40,7 @@ dobri
 dobro
 dokler
 dol
-dolg
-dolga
-dolgi
 dovolj
-drug
-druga
-drugi
-drugo
-dva
-dve
 e
 eden
 en
@@ -74,7 +51,6 @@ enkrat
 eno
 etc.
 f
-februar
 g
 g.
 ga
@@ -93,16 +69,12 @@ iv
 ix
 iz
 j
-januar
 jaz
 je
 ji
 jih
 jim
 jo
-julij
-junij
-jutri
 k
 kadarkoli
 kaj
@@ -123,41 +95,23 @@ kje
 kjer
 kjerkoli
 ko
-koder
 koderkoli
 koga
 komu
 kot
-kratek
-kratka
-kratke
-kratki
 l
-lahka
-lahke
-lahki
-lahko
 le
 lep
 lepa
 lepe
 lepi
 lepo
-leto
 m
-maj
-majhen
-majhna
-majhni
-malce
-malo
 manj
-marec
 me
 med
 medtem
 mene
-mesec
 mi
 midva
 midve
@@ -183,7 +137,6 @@ najmanj
 naju
 največ
 nam
-narobe
 nas
 nato
 nazaj
@@ -192,7 +145,6 @@ naša
 naše
 ne
 nedavno
-nedelja
 nek
 neka
 nekaj
@@ -236,7 +188,6 @@ njuna
 njuno
 no
 nocoj
-november
 npr.
 o
 ob
@@ -244,51 +195,23 @@ oba
 obe
 oboje
 od
-odprt
-odprta
-odprti
 okoli
-oktober
 on
 onadva
 one
 oni
 onidve
-osem
-osma
-osmi
-osmo
 oz.
 p
 pa
-pet
-peta
-petek
-peti
-peto
 po
 pod
 pogosto
 poleg
-poln
-polna
-polni
-polno
 ponavadi
-ponedeljek
 ponovno
 potem
 povsod
-pozdravljen
-pozdravljeni
-prav
-prava
-prave
-pravi
-pravo
-prazen
-prazna
-prazno
 prbl.
 precej
 pred
@@ -297,19 +220,10 @@ preko
 pri
 pribl.
 približno
-primer
-pripravljen
-pripravljena
-pripravljeni
 proti
-prva
-prvi
-prvo
 r
-ravno
 redko
 res
-reč
 s
 saj
 sam
@@ -321,29 +235,17 @@ se
 sebe
 sebi
 sedaj
-sedem
-sedma
-sedmi
-sedmo
 sem
-september
 seveda
 si
 sicer
 skoraj
 skozi
-slab
 smo
 so
-sobota
 spet
-sreda
-srednja
-srednji
 sta
 ste
-stran
-stvar
 sva
 t
 ta
@@ -358,10 +260,6 @@ te
 tebe
 tebi
 tega
-težak
-težka
-težki
-težko
 ti
 tista
 tiste
@@ -371,11 +269,6 @@ tj.
 tja
 to
 toda
-torek
-tretja
-tretje
-tretji
-tri
 tu
 tudi
 tukaj
@@ -392,10 +285,6 @@ vaša
 vaše
 ve
 vedno
-velik
-velika
-veliki
-veliko
 vendar
 ves
 več
@@ -403,10 +292,6 @@ vi
 vidva
 vii
 viii
-visok
-visoka
-visoke
-visoki
 vsa
 vsaj
 vsak
@@ -420,34 +305,21 @@ vsega
 vsi
 vso
 včasih
-včeraj
 x
 z
 za
 zadaj
 zadnji
 zakaj
-zaprta
-zaprti
-zaprto
 zdaj
 zelo
 zunaj
 č
 če
 često
-četrta
-četrtek
-četrti
-četrto
 čez
 čigav
 š
-šest
-šesta
-šesti
-šesto
-štiri
 ž
 že
 """.split()

spacy/tests/conftest.py

@@ -155,6 +155,11 @@ def fr_tokenizer():
     return get_lang_class("fr")().tokenizer


+@pytest.fixture(scope="session")
+def fr_vocab():
+    return get_lang_class("fr")().vocab
+
+
 @pytest.fixture(scope="session")
 def ga_tokenizer():
     return get_lang_class("ga")().tokenizer
@@ -205,6 +210,11 @@ def it_tokenizer():
     return get_lang_class("it")().tokenizer


+@pytest.fixture(scope="session")
+def it_vocab():
+    return get_lang_class("it")().vocab
+
+
 @pytest.fixture(scope="session")
 def ja_tokenizer():
     pytest.importorskip("sudachipy")

spacy/tests/lang/fr/test_noun_chunks.py

@@ -1,8 +1,230 @@
from spacy.tokens import Doc
import pytest

# fmt: off
@pytest.mark.parametrize(
"words,heads,deps,pos,chunk_offsets",
[
# determiner + noun
# un nom -> un nom
(
["un", "nom"],
[1, 1],
["det", "ROOT"],
["DET", "NOUN"],
[(0, 2)],
),
# determiner + noun starting with vowel
# l'heure -> l'heure
(
["l'", "heure"],
[1, 1],
["det", "ROOT"],
["DET", "NOUN"],
[(0, 2)],
),
# determiner + plural noun
# les romans -> les romans
(
["les", "romans"],
[1, 1],
["det", "ROOT"],
["DET", "NOUN"],
[(0, 2)],
),
# det + adj + noun
# Le vieux Londres -> Le vieux Londres
(
['Les', 'vieux', 'Londres'],
[2, 2, 2],
["det", "amod", "ROOT"],
["DET", "ADJ", "NOUN"],
[(0,3)]
),
# det + noun + adj
# le nom propre -> le nom propre a proper noun
(
["le", "nom", "propre"],
[1, 1, 1],
["det", "ROOT", "amod"],
["DET", "NOUN", "ADJ"],
[(0, 3)],
),
# det + noun + adj plural
# Les chiens bruns -> les chiens bruns
(
["Les", "chiens", "bruns"],
[1, 1, 1],
["det", "ROOT", "amod"],
["DET", "NOUN", "ADJ"],
[(0, 3)],
),
# multiple adjectives: one adj before the noun, one adj after the noun
# un nouveau film intéressant -> un nouveau film intéressant
(
["un", "nouveau", "film", "intéressant"],
[2, 2, 2, 2],
["det", "amod", "ROOT", "amod"],
["DET", "ADJ", "NOUN", "ADJ"],
[(0,4)]
),
# multiple adjectives, both adjs after the noun
# une personne intelligente et drôle -> une personne intelligente et drôle
(
["une", "personne", "intelligente", "et", "drôle"],
[1, 1, 1, 4, 2],
["det", "ROOT", "amod", "cc", "conj"],
["DET", "NOUN", "ADJ", "CCONJ", "ADJ"],
[(0,5)]
),
# relative pronoun
# un bus qui va au ville -> un bus, qui, ville
(
['un', 'bus', 'qui', 'va', 'au', 'ville'],
[1, 1, 3, 1, 5, 3],
['det', 'ROOT', 'nsubj', 'acl:relcl', 'case', 'obl:arg'],
['DET', 'NOUN', 'PRON', 'VERB', 'ADP', 'NOUN'],
[(0,2), (2,3), (5,6)]
),
# relative subclause
# Voilà la maison que nous voulons acheter -> la maison, nous That's the house that we want to buy.
(
['Voilà', 'la', 'maison', 'que', 'nous', 'voulons', 'acheter'],
[0, 2, 0, 5, 5, 2, 5],
['ROOT', 'det', 'obj', 'mark', 'nsubj', 'acl:relcl', 'xcomp'],
['VERB', 'DET', 'NOUN', 'SCONJ', 'PRON', 'VERB', 'VERB'],
[(1,3), (4,5)]
),
# Person name and title by flat
# Louis XIV -> Louis XIV
(
["Louis", "XIV"],
[0, 0],
["ROOT", "flat:name"],
["PROPN", "PROPN"],
[(0,2)]
),
# Organization name by flat
# Nations Unies -> Nations Unies
(
["Nations", "Unies"],
[0, 0],
["ROOT", "flat:name"],
["PROPN", "PROPN"],
[(0,2)]
),
# Noun compound, person name created by two flats
# Louise de Bratagne -> Louise de Bratagne
(
["Louise", "de", "Bratagne"],
[0, 0, 0],
["ROOT", "flat:name", "flat:name"],
["PROPN", "PROPN", "PROPN"],
[(0,3)]
),
# Noun compound, person name created by two flats
# Louis François Joseph -> Louis François Joseph
(
["Louis", "François", "Joseph"],
[0, 0, 0],
["ROOT", "flat:name", "flat:name"],
["PROPN", "PROPN", "PROPN"],
[(0,3)]
),
# one determiner + one noun + one adjective qualified by an adverb
# quelques agriculteurs très riches -> quelques agriculteurs très riches
(
["quelques", "agriculteurs", "très", "riches"],
[1, 1, 3, 1],
['det', 'ROOT', 'advmod', 'amod'],
['DET', 'NOUN', 'ADV', 'ADJ'],
[(0,4)]
),
# Two NPs conjuncted
# Il a un chien et un chat -> Il, un chien, un chat
(
['Il', 'a', 'un', 'chien', 'et', 'un', 'chat'],
[1, 1, 3, 1, 6, 6, 3],
['nsubj', 'ROOT', 'det', 'obj', 'cc', 'det', 'conj'],
['PRON', 'VERB', 'DET', 'NOUN', 'CCONJ', 'DET', 'NOUN'],
[(0,1), (2,4), (5,7)]
),
# Two NPs together
# l'écrivain brésilien Aníbal Machado -> l'écrivain brésilien, Aníbal Machado
(
["l'", 'écrivain', 'brésilien', 'Aníbal', 'Machado'],
[1, 1, 1, 1, 3],
['det', 'ROOT', 'amod', 'appos', 'flat:name'],
['DET', 'NOUN', 'ADJ', 'PROPN', 'PROPN'],
[(0, 3), (3, 5)]
),
# nmod relation between NPs
# la destruction de la ville -> la destruction, la ville
(
['la', 'destruction', 'de', 'la', 'ville'],
[1, 1, 4, 4, 1],
['det', 'ROOT', 'case', 'det', 'nmod'],
['DET', 'NOUN', 'ADP', 'DET', 'NOUN'],
[(0,2), (3,5)]
),
# nmod relation between NPs
# Archiduchesse dAutriche -> Archiduchesse, Autriche
(
['Archiduchesse', 'd', 'Autriche'],
[0, 2, 0],
['ROOT', 'case', 'nmod'],
['NOUN', 'ADP', 'PROPN'],
[(0,1), (2,3)]
),
# Compounding by nmod, several NPs chained together
# la première usine de drogue du gouvernement -> la première usine, drogue, gouvernement
(
["la", "première", "usine", "de", "drogue", "du", "gouvernement"],
[2, 2, 2, 4, 2, 6, 2],
['det', 'amod', 'ROOT', 'case', 'nmod', 'case', 'nmod'],
['DET', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN'],
[(0, 3), (4, 5), (6, 7)]
),
# several NPs
# Traduction du rapport de Susana -> Traduction, rapport, Susana
(
['Traduction', 'du', 'raport', 'de', 'Susana'],
[0, 2, 0, 4, 2],
['ROOT', 'case', 'nmod', 'case', 'nmod'],
['NOUN', 'ADP', 'NOUN', 'ADP', 'PROPN'],
[(0,1), (2,3), (4,5)]
),
# Several NPs
# Le gros chat de Susana et son amie -> Le gros chat, Susana, son amie
(
['Le', 'gros', 'chat', 'de', 'Susana', 'et', 'son', 'amie'],
[2, 2, 2, 4, 2, 7, 7, 2],
['det', 'amod', 'ROOT', 'case', 'nmod', 'cc', 'det', 'conj'],
['DET', 'ADJ', 'NOUN', 'ADP', 'PROPN', 'CCONJ', 'DET', 'NOUN'],
[(0,3), (4,5), (6,8)]
),
# Passive subject
# Les nouvelles dépenses sont alimentées par le grand compte bancaire de Clinton -> Les nouvelles dépenses, le grand compte bancaire, Clinton
(
['Les', 'nouvelles', 'dépenses', 'sont', 'alimentées', 'par', 'le', 'grand', 'compte', 'bancaire', 'de', 'Clinton'],
[2, 2, 4, 4, 4, 8, 8, 8, 4, 8, 11, 8],
['det', 'amod', 'nsubj:pass', 'aux:pass', 'ROOT', 'case', 'det', 'amod', 'obl:agent', 'amod', 'case', 'nmod'],
['DET', 'ADJ', 'NOUN', 'AUX', 'VERB', 'ADP', 'DET', 'ADJ', 'NOUN', 'ADJ', 'ADP', 'PROPN'],
[(0, 3), (6, 10), (11, 12)]
)
],
)
# fmt: on
def test_fr_noun_chunks(fr_vocab, words, heads, deps, pos, chunk_offsets):
doc = Doc(fr_vocab, words=words, heads=heads, deps=deps, pos=pos)
assert [(c.start, c.end) for c in doc.noun_chunks] == chunk_offsets


def test_noun_chunks_is_parsed_fr(fr_tokenizer):
    """Test that noun_chunks raises a ValueError for 'fr' if the Doc is not parsed."""
-    doc = fr_tokenizer("trouver des travaux antérieurs")
+    doc = fr_tokenizer("Je suis allé à l'école")
    with pytest.raises(ValueError):
        list(doc.noun_chunks)

spacy/tests/lang/it/test_noun_chunks.py (new file)

@@ -0,0 +1,221 @@
from spacy.tokens import Doc
import pytest

# fmt: off
@pytest.mark.parametrize(
"words,heads,deps,pos,chunk_offsets",
[
# determiner + noun
# un pollo -> un pollo
(
["un", "pollo"],
[1, 1],
["det", "ROOT"],
["DET", "NOUN"],
[(0,2)],
),
# two determiners + noun
# il mio cane -> il mio cane
(
["il", "mio", "cane"],
[2, 2, 2],
["det", "det:poss", "ROOT"],
["DET", "DET", "NOUN"],
[(0,3)],
),
# two determiners, one is after noun. rare usage but still testing
# il cane mio-> il cane mio
(
["il", "cane", "mio"],
[1, 1, 1],
["det", "ROOT", "det:poss"],
["DET", "NOUN", "DET"],
[(0,3)],
),
# relative pronoun
# È molto bello il vestito che hai acquistat -> il vestito, che the dress that you bought is very pretty.
(
["È", "molto", "bello", "il", "vestito", "che", "hai", "acquistato"],
[2, 2, 2, 4, 2, 7, 7, 4],
['cop', 'advmod', 'ROOT', 'det', 'nsubj', 'obj', 'aux', 'acl:relcl'],
['AUX', 'ADV', 'ADJ', 'DET', 'NOUN', 'PRON', 'AUX', 'VERB'],
[(3,5), (5,6)]
),
# relative subclause
# il computer che hai comprato -> il computer, che the computer that you bought
(
['il', 'computer', 'che', 'hai', 'comprato'],
[1, 1, 4, 4, 1],
['det', 'ROOT', 'nsubj', 'aux', 'acl:relcl'],
['DET', 'NOUN', 'PRON', 'AUX', 'VERB'],
[(0,2), (2,3)]
),
# det + noun + adj
# Una macchina grande -> Una macchina grande
(
["Una", "macchina", "grande"],
[1, 1, 1],
["det", "ROOT", "amod"],
["DET", "NOUN", "ADJ"],
[(0,3)],
),
# noun + adj plural
# mucche bianche
(
["mucche", "bianche"],
[0, 0],
["ROOT", "amod"],
["NOUN", "ADJ"],
[(0,2)],
),
# det + adj + noun
# Una grande macchina -> Una grande macchina
(
['Una', 'grande', 'macchina'],
[2, 2, 2],
["det", "amod", "ROOT"],
["DET", "ADJ", "NOUN"],
[(0,3)]
),
# det + adj + noun, det with apostrophe
# un'importante associazione -> un'importante associazione
(
["Un'", 'importante', 'associazione'],
[2, 2, 2],
["det", "amod", "ROOT"],
["DET", "ADJ", "NOUN"],
[(0,3)]
),
# multiple adjectives
# Un cane piccolo e marrone -> Un cane piccolo e marrone
(
["Un", "cane", "piccolo", "e", "marrone"],
[1, 1, 1, 4, 2],
["det", "ROOT", "amod", "cc", "conj"],
["DET", "NOUN", "ADJ", "CCONJ", "ADJ"],
[(0,5)]
),
# determiner, adjective, compound created by flat
# le Nazioni Unite -> le Nazioni Unite
(
["le", "Nazioni", "Unite"],
[1, 1, 1],
["det", "ROOT", "flat:name"],
["DET", "PROPN", "PROPN"],
[(0,3)]
),
# one determiner + one noun + one adjective qualified by an adverb
# alcuni contadini molto ricchi -> alcuni contadini molto ricchi some very rich farmers
(
['alcuni', 'contadini', 'molto', 'ricchi'],
[1, 1, 3, 1],
['det', 'ROOT', 'advmod', 'amod'],
['DET', 'NOUN', 'ADV', 'ADJ'],
[(0,4)]
),
# Two NPs conjuncted
# Ho un cane e un gatto -> un cane, un gatto
(
['Ho', 'un', 'cane', 'e', 'un', 'gatto'],
[0, 2, 0, 5, 5, 0],
['ROOT', 'det', 'obj', 'cc', 'det', 'conj'],
['VERB', 'DET', 'NOUN', 'CCONJ', 'DET', 'NOUN'],
[(1,3), (4,6)]
),
# Two NPs together
# lo scrittore brasiliano Aníbal Machado -> lo scrittore brasiliano, Aníbal Machado
(
['lo', 'scrittore', 'brasiliano', 'Aníbal', 'Machado'],
[1, 1, 1, 1, 3],
['det', 'ROOT', 'amod', 'nmod', 'flat:name'],
['DET', 'NOUN', 'ADJ', 'PROPN', 'PROPN'],
[(0, 3), (3, 5)]
),
# Noun compound, person name and titles
# Dom Pedro II -> Dom Pedro II
(
["Dom", "Pedro", "II"],
[0, 0, 0],
["ROOT", "flat:name", "flat:name"],
["PROPN", "PROPN", "PROPN"],
[(0,3)]
),
# Noun compound created by flat
# gli Stati Uniti
(
["gli", "Stati", "Uniti"],
[1, 1, 1],
["det", "ROOT", "flat:name"],
["DET", "PROPN", "PROPN"],
[(0,3)]
),
# nmod relation between NPs
# la distruzione della città -> la distruzione, città
(
['la', 'distruzione', 'della', 'città'],
[1, 1, 3, 1],
['det', 'ROOT', 'case', 'nmod'],
['DET', 'NOUN', 'ADP', 'NOUN'],
[(0,2), (3,4)]
),
# Compounding by nmod, several NPs chained together
# la prima fabbrica di droga del governo -> la prima fabbrica, droga, governo
(
["la", "prima", "fabbrica", "di", "droga", "del", "governo"],
[2, 2, 2, 4, 2, 6, 2],
['det', 'amod', 'ROOT', 'case', 'nmod', 'case', 'nmod'],
['DET', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN'],
[(0, 3), (4, 5), (6, 7)]
),
# several NPs
# Traduzione del rapporto di Susana -> Traduzione, rapporto, Susana
(
['Traduzione', 'del', 'rapporto', 'di', 'Susana'],
[0, 2, 0, 4, 2],
['ROOT', 'case', 'nmod', 'case', 'nmod'],
['NOUN', 'ADP', 'NOUN', 'ADP', 'PROPN'],
[(0,1), (2,3), (4,5)]
),
# Several NPs
# Il gatto grasso di Susana e la sua amica -> Il gatto grasso, Susana, sua amica
(
['Il', 'gatto', 'grasso', 'di', 'Susana', 'e', 'la', 'sua', 'amica'],
[1, 1, 1, 4, 1, 8, 8, 8, 1],
['det', 'ROOT', 'amod', 'case', 'nmod', 'cc', 'det', 'det:poss', 'conj'],
['DET', 'NOUN', 'ADJ', 'ADP', 'PROPN', 'CCONJ', 'DET', 'DET', 'NOUN'],
[(0,3), (4,5), (6,9)]
),
# Passive subject
# La nuova spesa è alimentata dal grande conto in banca di Clinton -> Le nuova spesa, grande conto, banca, Clinton
(
['La', 'nuova', 'spesa', 'è', 'alimentata', 'dal', 'grande', 'conto', 'in', 'banca', 'di', 'Clinton'],
[2, 2, 4, 4, 4, 7, 7, 4, 9, 7, 11, 9],
['det', 'amod', 'nsubj:pass', 'aux:pass', 'ROOT', 'case', 'amod', 'obl:agent', 'case', 'nmod', 'case', 'nmod'],
['DET', 'ADJ', 'NOUN', 'AUX', 'VERB', 'ADP', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'ADP', 'PROPN'],
[(0, 3), (6, 8), (9, 10), (11,12)]
),
# Misc
# Ma mentre questo prestito possa ora sembrare gestibile, un improvviso cambiamento delle circostanze potrebbe portare a problemi di debiti -> questo prestiti, un provisso cambiento, circostanze, problemi, debiti
(
['Ma', 'mentre', 'questo', 'prestito', 'possa', 'ora', 'sembrare', 'gestibile', ',', 'un', 'improvviso', 'cambiamento', 'delle', 'circostanze', 'potrebbe', 'portare', 'a', 'problemi', 'di', 'debitii'],
[15, 6, 3, 6, 6, 6, 15, 6, 6, 11, 11, 15, 13, 11, 15, 15, 17, 15, 19, 17],
['cc', 'mark', 'det', 'nsubj', 'aux', 'advmod', 'advcl', 'xcomp', 'punct', 'det', 'amod', 'nsubj', 'case', 'nmod', 'aux', 'ROOT', 'case', 'obl', 'case', 'nmod'],
['CCONJ', 'SCONJ', 'DET', 'NOUN', 'AUX', 'ADV', 'VERB', 'ADJ', 'PUNCT', 'DET', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'AUX', 'VERB', 'ADP', 'NOUN', 'ADP', 'NOUN'],
[(2,4), (9,12), (13,14), (17,18), (19,20)]
)
],
)
# fmt: on
def test_it_noun_chunks(it_vocab, words, heads, deps, pos, chunk_offsets):
doc = Doc(it_vocab, words=words, heads=heads, deps=deps, pos=pos)
assert [(c.start, c.end) for c in doc.noun_chunks] == chunk_offsets


def test_noun_chunks_is_parsed_it(it_tokenizer):
    """Test that noun_chunks raises a ValueError for 'it' if the Doc is not parsed."""
    doc = it_tokenizer("Sei andato a Oxford")
with pytest.raises(ValueError):
list(doc.noun_chunks)