mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-24 17:06:29 +03:00
Bump sudachipy version (#9917)
* Edited Slovenian stop words list (#9707) * Noun chunks for Italian (#9662) * added it vocab * copied Portuguese * added possessive determiner * added conjed NPs * added nmoded NPs * test misc * more examples * fixed typo * fixed parenth * fixed comma * comma fix * added syntax iters * fix some index problems * fixed index * corrected heads for test case * fixed test case * fixed determiner gender * cleaned left over * added example with apostrophe * French NP review (#9667) * adapted from pt * added basic tests * added fr vocab * fixed noun chunks * more examples * typo fix * changed naming * changed the naming * typo fix * Add Japanese kana characters to default exceptions (fix #9693) (#9742) This includes the main kana, or phonetic characters, used in Japanese. There are some supplemental kana blocks in Unicode outside the BMP that could also be included, but because their actual use is rare I omitted them for now, but maybe they should be added. The omitted blocks are: - Kana Supplement - Kana Extended (A and B) - Small Kana Extension * Remove NER words from stop words in Norwegian (#9820) Default stop words in Norwegian bokmål (nb) in spaCy contain important entities, e.g. France, Germany, Russia, Sweden and USA, police district, important units of time, e.g. months and days of the week, and organisations. Nobody expects their presence among the default stop words. There is a danger of users complying with the general recommendation of filtering out stop words, while being unaware of filtering out important entities from their data. See explanation in https://github.com/explosion/spaCy/issues/3052#issuecomment-986756711 and comment https://github.com/explosion/spaCy/issues/3052#issuecomment-986951831 * Bump sudachipy version * Update sudachipy versions * Bump versions Bumping to the most recent dictionary just to keep things current. Bumping sudachipy to 0.5.2 because older versions don't support recent dictionaries.
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> Co-authored-by: Richard Hudson <richard@explosion.ai> Co-authored-by: Duygu Altinok <duygu@explosion.ai> Co-authored-by: Haakon Meland Eriksen <haakon.eriksen@far.no>
This commit is contained in:
parent
a784b12eff
commit
58bdd8607b
|
@ -108,8 +108,8 @@ apple =
|
|||
thinc-apple-ops>=0.0.4,<1.0.0
|
||||
# Language tokenizers with external dependencies
|
||||
ja =
|
||||
sudachipy>=0.4.9
|
||||
sudachidict_core>=20200330
|
||||
sudachipy>=0.5.2,!=0.6.1
|
||||
sudachidict_core>=20211220
|
||||
ko =
|
||||
natto-py==0.9.0
|
||||
th =
|
||||
|
|
|
@ -45,6 +45,10 @@ _hangul_syllables = r"\uAC00-\uD7AF"
|
|||
_hangul_jamo = r"\u1100-\u11FF"
|
||||
_hangul = _hangul_syllables + _hangul_jamo
|
||||
|
||||
# Hiragana code points U+3040-U+309F, written as a "first-last" range string
# (presumably spliced into a regex character class by the code that uses it).
_hiragana = r"\u3040-\u309F"
# Katakana code points U+30A0-U+30FF. The literal "ー" (U+30FC) appended here
# already lies inside that range, so it is listed explicitly but adds nothing new.
_katakana = r"\u30A0-\u30FFー"
# Combined kana ranges: hiragana followed by katakana.
_kana = _hiragana + _katakana
|
||||
|
||||
# letters with diacritics - Catalan, Czech, Latin, Latvian, Lithuanian, Polish, Slovak, Turkish, Welsh
|
||||
_latin_u_extendedA = (
|
||||
r"\u0100\u0102\u0104\u0106\u0108\u010A\u010C\u010E\u0110\u0112\u0114\u0116\u0118\u011A\u011C"
|
||||
|
@ -244,6 +248,7 @@ _uncased = (
|
|||
+ _tamil
|
||||
+ _telugu
|
||||
+ _hangul
|
||||
+ _kana
|
||||
+ _cjk
|
||||
)
|
||||
|
||||
|
|
|
@ -6,16 +6,35 @@ from ...tokens import Doc, Span
|
|||
|
||||
|
||||
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
|
||||
"""Detect base noun phrases from a dependency parse. Works on Doc and Span."""
|
||||
# fmt: off
|
||||
labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
|
||||
# fmt: on
|
||||
"""
|
||||
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
|
||||
"""
|
||||
labels = [
|
||||
"nsubj",
|
||||
"nsubj:pass",
|
||||
"obj",
|
||||
"obl",
|
||||
"obl:agent",
|
||||
"obl:arg",
|
||||
"obl:mod",
|
||||
"nmod",
|
||||
"pcomp",
|
||||
"appos",
|
||||
"ROOT",
|
||||
]
|
||||
post_modifiers = ["flat", "flat:name", "flat:foreign", "fixed", "compound"]
|
||||
doc = doclike.doc # Ensure works on both Doc and Span.
|
||||
if not doc.has_annotation("DEP"):
|
||||
raise ValueError(Errors.E029)
|
||||
np_deps = [doc.vocab.strings[label] for label in labels]
|
||||
conj = doc.vocab.strings.add("conj")
|
||||
np_deps = {doc.vocab.strings.add(label) for label in labels}
|
||||
np_modifs = {doc.vocab.strings.add(modifier) for modifier in post_modifiers}
|
||||
np_label = doc.vocab.strings.add("NP")
|
||||
adj_label = doc.vocab.strings.add("amod")
|
||||
det_label = doc.vocab.strings.add("det")
|
||||
det_pos = doc.vocab.strings.add("DET")
|
||||
adp_pos = doc.vocab.strings.add("ADP")
|
||||
conj_label = doc.vocab.strings.add("conj")
|
||||
conj_pos = doc.vocab.strings.add("CCONJ")
|
||||
prev_end = -1
|
||||
for i, word in enumerate(doclike):
|
||||
if word.pos not in (NOUN, PROPN, PRON):
|
||||
|
@ -24,16 +43,45 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
|
|||
if word.left_edge.i <= prev_end:
|
||||
continue
|
||||
if word.dep in np_deps:
|
||||
prev_end = word.right_edge.i
|
||||
yield word.left_edge.i, word.right_edge.i + 1, np_label
|
||||
elif word.dep == conj:
|
||||
right_childs = list(word.rights)
|
||||
right_child = right_childs[0] if right_childs else None
|
||||
|
||||
if right_child:
|
||||
if (
|
||||
right_child.dep == adj_label
|
||||
): # allow chain of adjectives by expanding to right
|
||||
right_end = right_child.right_edge
|
||||
elif (
|
||||
right_child.dep == det_label and right_child.pos == det_pos
|
||||
): # cut relative pronouns here
|
||||
right_end = right_child
|
||||
elif right_child.dep in np_modifs: # Check if we can expand to right
|
||||
right_end = word.right_edge
|
||||
else:
|
||||
right_end = word
|
||||
else:
|
||||
right_end = word
|
||||
prev_end = right_end.i
|
||||
|
||||
left_index = word.left_edge.i
|
||||
left_index = (
|
||||
left_index + 1 if word.left_edge.pos == adp_pos else left_index
|
||||
)
|
||||
|
||||
yield left_index, right_end.i + 1, np_label
|
||||
elif word.dep == conj_label:
|
||||
head = word.head
|
||||
while head.dep == conj and head.head.i < head.i:
|
||||
while head.dep == conj_label and head.head.i < head.i:
|
||||
head = head.head
|
||||
# If the head is an NP, and we're coordinated to it, we're an NP
|
||||
if head.dep in np_deps:
|
||||
prev_end = word.right_edge.i
|
||||
yield word.left_edge.i, word.right_edge.i + 1, np_label
|
||||
prev_end = word.i
|
||||
|
||||
left_index = word.left_edge.i # eliminate left attached conjunction
|
||||
left_index = (
|
||||
left_index + 1 if word.left_edge.pos == conj_pos else left_index
|
||||
)
|
||||
yield left_index, word.i + 1, np_label
|
||||
|
||||
|
||||
# Syntax iterators exported by this module, keyed by iterator name.
SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
|
||||
|
|
|
@ -6,13 +6,15 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
|||
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
|
||||
from ...language import Language, BaseDefaults
|
||||
from .lemmatizer import ItalianLemmatizer
|
||||
from .syntax_iterators import SYNTAX_ITERATORS
|
||||
|
||||
|
||||
class ItalianDefaults(BaseDefaults):
    """Language defaults for Italian.

    Bundles the Italian tokenizer exceptions, prefix/infix punctuation rules,
    stop words and syntax iterators imported at the top of this module.
    """

    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
    prefixes = TOKENIZER_PREFIXES
    infixes = TOKENIZER_INFIXES
    # Assigned exactly once; a second identical assignment would be redundant.
    stop_words = STOP_WORDS
    syntax_iterators = SYNTAX_ITERATORS
|
||||
|
||||
|
||||
class Italian(Language):
|
||||
|
|
86
spacy/lang/it/syntax_iterators.py
Normal file
86
spacy/lang/it/syntax_iterators.py
Normal file
|
@ -0,0 +1,86 @@
|
|||
from typing import Union, Iterator, Tuple
|
||||
|
||||
from ...symbols import NOUN, PROPN, PRON
|
||||
from ...errors import Errors
|
||||
from ...tokens import Doc, Span
|
||||
|
||||
|
||||
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
    """Detect base noun phrases from a dependency parse.

    Works on both Doc and Span.

    doclike (Union[Doc, Span]): The parsed object whose tokens are scanned.
    YIELDS (Tuple[int, int, int]): (start, end, label) triples, where start and
        end are built from token `.i` values (end is the last token's `.i` + 1)
        and label is the string-store ID of "NP".
    RAISES (ValueError): Errors.E029 if the doc has no "DEP" annotation.
    """
    labels = [
        "nsubj",
        "nsubj:pass",
        "obj",
        "obl",
        "obl:agent",
        "nmod",
        "pcomp",
        "appos",
        "ROOT",
    ]
    post_modifiers = ["flat", "flat:name", "fixed", "compound"]
    dets = ["det", "det:poss"]
    doc = doclike.doc  # Ensure works on both Doc and Span.
    if not doc.has_annotation("DEP"):
        raise ValueError(Errors.E029)
    # Resolve every label/POS string to its string-store ID once, so the loop
    # below only compares integers.
    strings = doc.vocab.strings
    np_deps = {strings.add(label) for label in labels}
    np_modifs = {strings.add(modifier) for modifier in post_modifiers}
    np_label = strings.add("NP")
    adj_label = strings.add("amod")
    det_labels = {strings.add(det) for det in dets}
    det_pos = strings.add("DET")
    adp_pos = strings.add("ADP")
    conj_label = strings.add("conj")
    conj_pos = strings.add("CCONJ")
    prev_end = -1
    for word in doclike:
        if word.pos not in (NOUN, PROPN, PRON):
            continue
        # Prevent nested chunks from being produced
        if word.left_edge.i <= prev_end:
            continue
        if word.dep in np_deps:
            # By default the chunk ends on the head word itself; only the
            # first right child can extend it.
            right_end = word
            right_child = next(iter(word.rights), None)
            if right_child:
                if right_child.dep == adj_label:
                    # allow chain of adjectives by expanding to right
                    right_end = right_child.right_edge
                elif right_child.dep in det_labels and right_child.pos == det_pos:
                    # a determiner directly to the right closes the chunk
                    # (original note: cut relative pronouns here)
                    right_end = right_child
                elif right_child.dep in np_modifs:
                    # flat/fixed/compound material: expand to the full right edge
                    right_end = word.right_edge
            prev_end = right_end.i

            left_index = word.left_edge.i
            # Leave a leading adposition out of the chunk.
            if word.left_edge.pos == adp_pos:
                left_index += 1

            yield left_index, right_end.i + 1, np_label
        elif word.dep == conj_label:
            # Walk leftwards up the chain of conjuncts to the first conjunct.
            head = word.head
            while head.dep == conj_label and head.head.i < head.i:
                head = head.head
            # If the head is an NP, and we're coordinated to it, we're an NP
            if head.dep in np_deps:
                prev_end = word.i

                left_index = word.left_edge.i
                # eliminate left attached conjunction
                if word.left_edge.pos == conj_pos:
                    left_index += 1
                yield left_index, word.i + 1, np_label
|
||||
|
||||
|
||||
# Syntax iterators exported by this module, keyed by iterator name.
SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
|
|
@ -4,46 +4,42 @@ alle allerede alt and andre annen annet at av
|
|||
|
||||
bak bare bedre beste blant ble bli blir blitt bris by både
|
||||
|
||||
da dag de del dem den denne der dermed det dette disse drept du
|
||||
da dag de del dem den denne der dermed det dette disse du
|
||||
|
||||
eller en enn er et ett etter
|
||||
|
||||
fem fikk fire fjor flere folk for fortsatt fotball fra fram frankrike fredag
|
||||
fem fikk fire fjor flere folk for fortsatt fra fram
|
||||
funnet få får fått før først første
|
||||
|
||||
gang gi gikk gjennom gjorde gjort gjør gjøre god godt grunn gå går
|
||||
|
||||
ha hadde ham han hans har hele helt henne hennes her hun hva hvor hvordan
|
||||
hvorfor
|
||||
ha hadde ham han hans har hele helt henne hennes her hun
|
||||
|
||||
i ifølge igjen ikke ingen inn
|
||||
|
||||
ja jeg
|
||||
|
||||
kamp kampen kan kl klart kom komme kommer kontakt kort kroner kunne kveld
|
||||
kvinner
|
||||
|
||||
la laget land landet langt leder ligger like litt løpet lørdag
|
||||
la laget land landet langt leder ligger like litt løpet
|
||||
|
||||
man mandag mange mannen mars med meg mellom men mener menn mennesker mens mer
|
||||
millioner minutter mot msci mye må mål måtte
|
||||
man mange med meg mellom men mener mennesker mens mer mot mye må mål måtte
|
||||
|
||||
ned neste noe noen nok norge norsk norske ntb ny nye nå når
|
||||
ned neste noe noen nok ny nye nå når
|
||||
|
||||
og også om onsdag opp opplyser oslo oss over
|
||||
og også om opp opplyser oss over
|
||||
|
||||
personer plass poeng politidistrikt politiet president prosent på
|
||||
personer plass poeng på
|
||||
|
||||
regjeringen runde rundt russland
|
||||
runde rundt
|
||||
|
||||
sa saken samme sammen samtidig satt se seg seks selv senere september ser sett
|
||||
sa saken samme sammen samtidig satt se seg seks selv senere ser sett
|
||||
siden sier sin sine siste sitt skal skriver skulle slik som sted stedet stor
|
||||
store står sverige svært så søndag
|
||||
store står svært så
|
||||
|
||||
ta tatt tid tidligere til tilbake tillegg tirsdag to tok torsdag tre tror
|
||||
tyskland
|
||||
ta tatt tid tidligere til tilbake tillegg tok tror
|
||||
|
||||
under usa ut uten utenfor
|
||||
under ut uten utenfor
|
||||
|
||||
vant var ved veldig vi videre viktig vil ville viser vår være vært
|
||||
|
||||
|
|
|
@ -1,13 +1,10 @@
|
|||
# Source: https://github.com/stopwords-iso/stopwords-sl
|
||||
# TODO: probably needs to be tidied up – the list seems to have month names in
|
||||
# it, which shouldn't be considered stop words.
|
||||
# Removed various words that are not normally considered stop words, such as months.
|
||||
|
||||
STOP_WORDS = set(
|
||||
"""
|
||||
a
|
||||
ali
|
||||
april
|
||||
avgust
|
||||
b
|
||||
bi
|
||||
bil
|
||||
|
@ -19,7 +16,6 @@ biti
|
|||
blizu
|
||||
bo
|
||||
bodo
|
||||
bojo
|
||||
bolj
|
||||
bom
|
||||
bomo
|
||||
|
@ -37,16 +33,6 @@ da
|
|||
daleč
|
||||
dan
|
||||
danes
|
||||
datum
|
||||
december
|
||||
deset
|
||||
deseta
|
||||
deseti
|
||||
deseto
|
||||
devet
|
||||
deveta
|
||||
deveti
|
||||
deveto
|
||||
do
|
||||
dober
|
||||
dobra
|
||||
|
@ -54,16 +40,7 @@ dobri
|
|||
dobro
|
||||
dokler
|
||||
dol
|
||||
dolg
|
||||
dolga
|
||||
dolgi
|
||||
dovolj
|
||||
drug
|
||||
druga
|
||||
drugi
|
||||
drugo
|
||||
dva
|
||||
dve
|
||||
e
|
||||
eden
|
||||
en
|
||||
|
@ -74,7 +51,6 @@ enkrat
|
|||
eno
|
||||
etc.
|
||||
f
|
||||
februar
|
||||
g
|
||||
g.
|
||||
ga
|
||||
|
@ -93,16 +69,12 @@ iv
|
|||
ix
|
||||
iz
|
||||
j
|
||||
januar
|
||||
jaz
|
||||
je
|
||||
ji
|
||||
jih
|
||||
jim
|
||||
jo
|
||||
julij
|
||||
junij
|
||||
jutri
|
||||
k
|
||||
kadarkoli
|
||||
kaj
|
||||
|
@ -123,41 +95,23 @@ kje
|
|||
kjer
|
||||
kjerkoli
|
||||
ko
|
||||
koder
|
||||
koderkoli
|
||||
koga
|
||||
komu
|
||||
kot
|
||||
kratek
|
||||
kratka
|
||||
kratke
|
||||
kratki
|
||||
l
|
||||
lahka
|
||||
lahke
|
||||
lahki
|
||||
lahko
|
||||
le
|
||||
lep
|
||||
lepa
|
||||
lepe
|
||||
lepi
|
||||
lepo
|
||||
leto
|
||||
m
|
||||
maj
|
||||
majhen
|
||||
majhna
|
||||
majhni
|
||||
malce
|
||||
malo
|
||||
manj
|
||||
marec
|
||||
me
|
||||
med
|
||||
medtem
|
||||
mene
|
||||
mesec
|
||||
mi
|
||||
midva
|
||||
midve
|
||||
|
@ -183,7 +137,6 @@ najmanj
|
|||
naju
|
||||
največ
|
||||
nam
|
||||
narobe
|
||||
nas
|
||||
nato
|
||||
nazaj
|
||||
|
@ -192,7 +145,6 @@ naša
|
|||
naše
|
||||
ne
|
||||
nedavno
|
||||
nedelja
|
||||
nek
|
||||
neka
|
||||
nekaj
|
||||
|
@ -236,7 +188,6 @@ njuna
|
|||
njuno
|
||||
no
|
||||
nocoj
|
||||
november
|
||||
npr.
|
||||
o
|
||||
ob
|
||||
|
@ -244,51 +195,23 @@ oba
|
|||
obe
|
||||
oboje
|
||||
od
|
||||
odprt
|
||||
odprta
|
||||
odprti
|
||||
okoli
|
||||
oktober
|
||||
on
|
||||
onadva
|
||||
one
|
||||
oni
|
||||
onidve
|
||||
osem
|
||||
osma
|
||||
osmi
|
||||
osmo
|
||||
oz.
|
||||
p
|
||||
pa
|
||||
pet
|
||||
peta
|
||||
petek
|
||||
peti
|
||||
peto
|
||||
po
|
||||
pod
|
||||
pogosto
|
||||
poleg
|
||||
poln
|
||||
polna
|
||||
polni
|
||||
polno
|
||||
ponavadi
|
||||
ponedeljek
|
||||
ponovno
|
||||
potem
|
||||
povsod
|
||||
pozdravljen
|
||||
pozdravljeni
|
||||
prav
|
||||
prava
|
||||
prave
|
||||
pravi
|
||||
pravo
|
||||
prazen
|
||||
prazna
|
||||
prazno
|
||||
prbl.
|
||||
precej
|
||||
pred
|
||||
|
@ -297,19 +220,10 @@ preko
|
|||
pri
|
||||
pribl.
|
||||
približno
|
||||
primer
|
||||
pripravljen
|
||||
pripravljena
|
||||
pripravljeni
|
||||
proti
|
||||
prva
|
||||
prvi
|
||||
prvo
|
||||
r
|
||||
ravno
|
||||
redko
|
||||
res
|
||||
reč
|
||||
s
|
||||
saj
|
||||
sam
|
||||
|
@ -321,29 +235,17 @@ se
|
|||
sebe
|
||||
sebi
|
||||
sedaj
|
||||
sedem
|
||||
sedma
|
||||
sedmi
|
||||
sedmo
|
||||
sem
|
||||
september
|
||||
seveda
|
||||
si
|
||||
sicer
|
||||
skoraj
|
||||
skozi
|
||||
slab
|
||||
smo
|
||||
so
|
||||
sobota
|
||||
spet
|
||||
sreda
|
||||
srednja
|
||||
srednji
|
||||
sta
|
||||
ste
|
||||
stran
|
||||
stvar
|
||||
sva
|
||||
t
|
||||
ta
|
||||
|
@ -358,10 +260,6 @@ te
|
|||
tebe
|
||||
tebi
|
||||
tega
|
||||
težak
|
||||
težka
|
||||
težki
|
||||
težko
|
||||
ti
|
||||
tista
|
||||
tiste
|
||||
|
@ -371,11 +269,6 @@ tj.
|
|||
tja
|
||||
to
|
||||
toda
|
||||
torek
|
||||
tretja
|
||||
tretje
|
||||
tretji
|
||||
tri
|
||||
tu
|
||||
tudi
|
||||
tukaj
|
||||
|
@ -392,10 +285,6 @@ vaša
|
|||
vaše
|
||||
ve
|
||||
vedno
|
||||
velik
|
||||
velika
|
||||
veliki
|
||||
veliko
|
||||
vendar
|
||||
ves
|
||||
več
|
||||
|
@ -403,10 +292,6 @@ vi
|
|||
vidva
|
||||
vii
|
||||
viii
|
||||
visok
|
||||
visoka
|
||||
visoke
|
||||
visoki
|
||||
vsa
|
||||
vsaj
|
||||
vsak
|
||||
|
@ -420,34 +305,21 @@ vsega
|
|||
vsi
|
||||
vso
|
||||
včasih
|
||||
včeraj
|
||||
x
|
||||
z
|
||||
za
|
||||
zadaj
|
||||
zadnji
|
||||
zakaj
|
||||
zaprta
|
||||
zaprti
|
||||
zaprto
|
||||
zdaj
|
||||
zelo
|
||||
zunaj
|
||||
č
|
||||
če
|
||||
često
|
||||
četrta
|
||||
četrtek
|
||||
četrti
|
||||
četrto
|
||||
čez
|
||||
čigav
|
||||
š
|
||||
šest
|
||||
šesta
|
||||
šesti
|
||||
šesto
|
||||
štiri
|
||||
ž
|
||||
že
|
||||
""".split()
|
||||
|
|
|
@ -155,6 +155,11 @@ def fr_tokenizer():
|
|||
return get_lang_class("fr")().tokenizer
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
def fr_vocab():
    """Session-scoped fixture returning the vocab of a new "fr" language object."""
    french_cls = get_lang_class("fr")
    nlp = french_cls()
    return nlp.vocab
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
def ga_tokenizer():
    """Session-scoped fixture returning the tokenizer of a new "ga" language object."""
    irish_cls = get_lang_class("ga")
    nlp = irish_cls()
    return nlp.tokenizer
|
||||
|
@ -205,6 +210,11 @@ def it_tokenizer():
|
|||
return get_lang_class("it")().tokenizer
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
def it_vocab():
    """Session-scoped fixture returning the vocab of a new "it" language object."""
    italian_cls = get_lang_class("it")
    nlp = italian_cls()
    return nlp.vocab
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def ja_tokenizer():
|
||||
pytest.importorskip("sudachipy")
|
||||
|
|
|
@ -1,8 +1,230 @@
|
|||
from spacy.tokens import Doc
|
||||
import pytest
|
||||
|
||||
|
||||
# fmt: off
|
||||
@pytest.mark.parametrize(
    "words,heads,deps,pos,chunk_offsets",
    [
        # Each case: tokens, head indices, dependency labels, POS tags and the
        # expected (start, end) token offsets of the noun chunks.
        # determiner + noun
        # un nom -> un nom
        (
            ["un", "nom"],
            [1, 1],
            ["det", "ROOT"],
            ["DET", "NOUN"],
            [(0, 2)],
        ),
        # determiner + noun starting with vowel
        # l'heure -> l'heure
        (
            ["l'", "heure"],
            [1, 1],
            ["det", "ROOT"],
            ["DET", "NOUN"],
            [(0, 2)],
        ),
        # determiner + plural noun
        # les romans -> les romans
        (
            ["les", "romans"],
            [1, 1],
            ["det", "ROOT"],
            ["DET", "NOUN"],
            [(0, 2)],
        ),
        # det + adj + noun
        # Le vieux Londres -> Le vieux Londres
        (
            ['Le', 'vieux', 'Londres'],
            [2, 2, 2],
            ["det", "amod", "ROOT"],
            ["DET", "ADJ", "NOUN"],
            [(0,3)]
        ),
        # det + noun + adj
        # le nom propre -> le nom propre (a proper noun)
        (
            ["le", "nom", "propre"],
            [1, 1, 1],
            ["det", "ROOT", "amod"],
            ["DET", "NOUN", "ADJ"],
            [(0, 3)],
        ),
        # det + noun + adj plural
        # Les chiens bruns -> les chiens bruns
        (
            ["Les", "chiens", "bruns"],
            [1, 1, 1],
            ["det", "ROOT", "amod"],
            ["DET", "NOUN", "ADJ"],
            [(0, 3)],
        ),
        # multiple adjectives: one adj before the noun, one adj after the noun
        # un nouveau film intéressant -> un nouveau film intéressant
        (
            ["un", "nouveau", "film", "intéressant"],
            [2, 2, 2, 2],
            ["det", "amod", "ROOT", "amod"],
            ["DET", "ADJ", "NOUN", "ADJ"],
            [(0,4)]
        ),
        # multiple adjectives, both adjs after the noun
        # une personne intelligente et drôle -> une personne intelligente et drôle
        (
            ["une", "personne", "intelligente", "et", "drôle"],
            [1, 1, 1, 4, 2],
            ["det", "ROOT", "amod", "cc", "conj"],
            ["DET", "NOUN", "ADJ", "CCONJ", "ADJ"],
            [(0,5)]
        ),
        # relative pronoun
        # un bus qui va au ville -> un bus, qui, ville
        (
            ['un', 'bus', 'qui', 'va', 'au', 'ville'],
            [1, 1, 3, 1, 5, 3],
            ['det', 'ROOT', 'nsubj', 'acl:relcl', 'case', 'obl:arg'],
            ['DET', 'NOUN', 'PRON', 'VERB', 'ADP', 'NOUN'],
            [(0,2), (2,3), (5,6)]
        ),
        # relative subclause
        # Voilà la maison que nous voulons acheter -> la maison, nous
        # (That's the house that we want to buy.)
        (
            ['Voilà', 'la', 'maison', 'que', 'nous', 'voulons', 'acheter'],
            [0, 2, 0, 5, 5, 2, 5],
            ['ROOT', 'det', 'obj', 'mark', 'nsubj', 'acl:relcl', 'xcomp'],
            ['VERB', 'DET', 'NOUN', 'SCONJ', 'PRON', 'VERB', 'VERB'],
            [(1,3), (4,5)]
        ),
        # Person name and title by flat
        # Louis XIV -> Louis XIV
        (
            ["Louis", "XIV"],
            [0, 0],
            ["ROOT", "flat:name"],
            ["PROPN", "PROPN"],
            [(0,2)]
        ),
        # Organization name by flat
        # Nations Unies -> Nations Unies
        (
            ["Nations", "Unies"],
            [0, 0],
            ["ROOT", "flat:name"],
            ["PROPN", "PROPN"],
            [(0,2)]
        ),
        # Noun compound, person name created by two flats
        # Louise de Bretagne -> Louise de Bretagne
        (
            ["Louise", "de", "Bretagne"],
            [0, 0, 0],
            ["ROOT", "flat:name", "flat:name"],
            ["PROPN", "PROPN", "PROPN"],
            [(0,3)]
        ),
        # Noun compound, person name created by two flats
        # Louis François Joseph -> Louis François Joseph
        (
            ["Louis", "François", "Joseph"],
            [0, 0, 0],
            ["ROOT", "flat:name", "flat:name"],
            ["PROPN", "PROPN", "PROPN"],
            [(0,3)]
        ),
        # one determiner + one noun + one adjective qualified by an adverb
        # quelques agriculteurs très riches -> quelques agriculteurs très riches
        (
            ["quelques", "agriculteurs", "très", "riches"],
            [1, 1, 3, 1],
            ['det', 'ROOT', 'advmod', 'amod'],
            ['DET', 'NOUN', 'ADV', 'ADJ'],
            [(0,4)]
        ),
        # Two NPs conjuncted
        # Il a un chien et un chat -> Il, un chien, un chat
        (
            ['Il', 'a', 'un', 'chien', 'et', 'un', 'chat'],
            [1, 1, 3, 1, 6, 6, 3],
            ['nsubj', 'ROOT', 'det', 'obj', 'cc', 'det', 'conj'],
            ['PRON', 'VERB', 'DET', 'NOUN', 'CCONJ', 'DET', 'NOUN'],
            [(0,1), (2,4), (5,7)]

        ),
        # Two NPs together
        # l'écrivain brésilien Aníbal Machado -> l'écrivain brésilien, Aníbal Machado
        (
            ["l'", 'écrivain', 'brésilien', 'Aníbal', 'Machado'],
            [1, 1, 1, 1, 3],
            ['det', 'ROOT', 'amod', 'appos', 'flat:name'],
            ['DET', 'NOUN', 'ADJ', 'PROPN', 'PROPN'],
            [(0, 3), (3, 5)]
        ),
        # nmod relation between NPs
        # la destruction de la ville -> la destruction, la ville
        (
            ['la', 'destruction', 'de', 'la', 'ville'],
            [1, 1, 4, 4, 1],
            ['det', 'ROOT', 'case', 'det', 'nmod'],
            ['DET', 'NOUN', 'ADP', 'DET', 'NOUN'],
            [(0,2), (3,5)]
        ),
        # nmod relation between NPs
        # Archiduchesse d’Autriche -> Archiduchesse, Autriche
        (
            ['Archiduchesse', 'd’', 'Autriche'],
            [0, 2, 0],
            ['ROOT', 'case', 'nmod'],
            ['NOUN', 'ADP', 'PROPN'],
            [(0,1), (2,3)]
        ),
        # Compounding by nmod, several NPs chained together
        # la première usine de drogue du gouvernement -> la première usine, drogue, gouvernement
        (
            ["la", "première", "usine", "de", "drogue", "du", "gouvernement"],
            [2, 2, 2, 4, 2, 6, 2],
            ['det', 'amod', 'ROOT', 'case', 'nmod', 'case', 'nmod'],
            ['DET', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN'],
            [(0, 3), (4, 5), (6, 7)]
        ),
        # several NPs
        # Traduction du rapport de Susana -> Traduction, rapport, Susana
        (
            ['Traduction', 'du', 'rapport', 'de', 'Susana'],
            [0, 2, 0, 4, 2],
            ['ROOT', 'case', 'nmod', 'case', 'nmod'],
            ['NOUN', 'ADP', 'NOUN', 'ADP', 'PROPN'],
            [(0,1), (2,3), (4,5)]

        ),
        # Several NPs
        # Le gros chat de Susana et son amie -> Le gros chat, Susana, son amie
        (
            ['Le', 'gros', 'chat', 'de', 'Susana', 'et', 'son', 'amie'],
            [2, 2, 2, 4, 2, 7, 7, 2],
            ['det', 'amod', 'ROOT', 'case', 'nmod', 'cc', 'det', 'conj'],
            ['DET', 'ADJ', 'NOUN', 'ADP', 'PROPN', 'CCONJ', 'DET', 'NOUN'],
            [(0,3), (4,5), (6,8)]
        ),
        # Passive subject
        # Les nouvelles dépenses sont alimentées par le grand compte bancaire de Clinton -> Les nouvelles dépenses, le grand compte bancaire, Clinton
        (
            ['Les', 'nouvelles', 'dépenses', 'sont', 'alimentées', 'par', 'le', 'grand', 'compte', 'bancaire', 'de', 'Clinton'],
            [2, 2, 4, 4, 4, 8, 8, 8, 4, 8, 11, 8],
            ['det', 'amod', 'nsubj:pass', 'aux:pass', 'ROOT', 'case', 'det', 'amod', 'obl:agent', 'amod', 'case', 'nmod'],
            ['DET', 'ADJ', 'NOUN', 'AUX', 'VERB', 'ADP', 'DET', 'ADJ', 'NOUN', 'ADJ', 'ADP', 'PROPN'],
            [(0, 3), (6, 10), (11, 12)]
        )
    ],
)
# fmt: on
def test_fr_noun_chunks(fr_vocab, words, heads, deps, pos, chunk_offsets):
    """Check that the noun chunks of a hand-annotated French Doc match the expected offsets."""
    doc = Doc(fr_vocab, words=words, heads=heads, deps=deps, pos=pos)
    assert [(c.start, c.end) for c in doc.noun_chunks] == chunk_offsets
|
||||
|
||||
|
||||
def test_noun_chunks_is_parsed_fr(fr_tokenizer):
    """Test that noun_chunks raises Value Error for 'fr' language if Doc is not parsed."""
    # Only one sentence is tokenized: an earlier assignment to `doc` was
    # overwritten before use and has been dropped.
    doc = fr_tokenizer("Je suis allé à l'école")
    with pytest.raises(ValueError):
        list(doc.noun_chunks)
|
||||
|
|
221
spacy/tests/lang/it/test_noun_chunks.py
Normal file
221
spacy/tests/lang/it/test_noun_chunks.py
Normal file
|
@ -0,0 +1,221 @@
|
|||
from spacy.tokens import Doc
|
||||
import pytest
|
||||
|
||||
|
||||
# fmt: off
|
||||
@pytest.mark.parametrize(
    "words,heads,deps,pos,chunk_offsets",
    [
        # Each case: tokens, head indices, dependency labels, POS tags and the
        # expected (start, end) token offsets of the noun chunks.
        # determiner + noun
        # un pollo -> un pollo
        (
            ["un", "pollo"],
            [1, 1],
            ["det", "ROOT"],
            ["DET", "NOUN"],
            [(0,2)],
        ),
        # two determiners + noun
        # il mio cane -> il mio cane
        (
            ["il", "mio", "cane"],
            [2, 2, 2],
            ["det", "det:poss", "ROOT"],
            ["DET", "DET", "NOUN"],
            [(0,3)],
        ),
        # two determiners, one is after noun. rare usage but still testing
        # il cane mio -> il cane mio
        (
            ["il", "cane", "mio"],
            [1, 1, 1],
            ["det", "ROOT", "det:poss"],
            ["DET", "NOUN", "DET"],
            [(0,3)],
        ),
        # relative pronoun
        # È molto bello il vestito che hai acquistato -> il vestito, che
        # (the dress that you bought is very pretty.)
        (
            ["È", "molto", "bello", "il", "vestito", "che", "hai", "acquistato"],
            [2, 2, 2, 4, 2, 7, 7, 4],
            ['cop', 'advmod', 'ROOT', 'det', 'nsubj', 'obj', 'aux', 'acl:relcl'],
            ['AUX', 'ADV', 'ADJ', 'DET', 'NOUN', 'PRON', 'AUX', 'VERB'],
            [(3,5), (5,6)]
        ),
        # relative subclause
        # il computer che hai comprato -> il computer, che
        # (the computer that you bought)
        (
            ['il', 'computer', 'che', 'hai', 'comprato'],
            [1, 1, 4, 4, 1],
            ['det', 'ROOT', 'nsubj', 'aux', 'acl:relcl'],
            ['DET', 'NOUN', 'PRON', 'AUX', 'VERB'],
            [(0,2), (2,3)]
        ),
        # det + noun + adj
        # Una macchina grande -> Una macchina grande
        (
            ["Una", "macchina", "grande"],
            [1, 1, 1],
            ["det", "ROOT", "amod"],
            ["DET", "NOUN", "ADJ"],
            [(0,3)],
        ),
        # noun + adj plural
        # mucche bianche
        (
            ["mucche", "bianche"],
            [0, 0],
            ["ROOT", "amod"],
            ["NOUN", "ADJ"],
            [(0,2)],
        ),
        # det + adj + noun
        # Una grande macchina -> Una grande macchina
        (
            ['Una', 'grande', 'macchina'],
            [2, 2, 2],
            ["det", "amod", "ROOT"],
            ["DET", "ADJ", "NOUN"],
            [(0,3)]
        ),
        # det + adj + noun, det with apostrophe
        # un'importante associazione -> un'importante associazione
        (
            ["Un'", 'importante', 'associazione'],
            [2, 2, 2],
            ["det", "amod", "ROOT"],
            ["DET", "ADJ", "NOUN"],
            [(0,3)]
        ),
        # multiple adjectives
        # Un cane piccolo e marrone -> Un cane piccolo e marrone
        (
            ["Un", "cane", "piccolo", "e", "marrone"],
            [1, 1, 1, 4, 2],
            ["det", "ROOT", "amod", "cc", "conj"],
            ["DET", "NOUN", "ADJ", "CCONJ", "ADJ"],
            [(0,5)]
        ),
        # determiner, adjective, compound created by flat
        # le Nazioni Unite -> le Nazioni Unite
        (
            ["le", "Nazioni", "Unite"],
            [1, 1, 1],
            ["det", "ROOT", "flat:name"],
            ["DET", "PROPN", "PROPN"],
            [(0,3)]
        ),
        # one determiner + one noun + one adjective qualified by an adverb
        # alcuni contadini molto ricchi -> alcuni contadini molto ricchi
        # (some very rich farmers)
        (
            ['alcuni', 'contadini', 'molto', 'ricchi'],
            [1, 1, 3, 1],
            ['det', 'ROOT', 'advmod', 'amod'],
            ['DET', 'NOUN', 'ADV', 'ADJ'],
            [(0,4)]
        ),
        # Two NPs conjuncted
        # Ho un cane e un gatto -> un cane, un gatto
        (
            ['Ho', 'un', 'cane', 'e', 'un', 'gatto'],
            [0, 2, 0, 5, 5, 0],
            ['ROOT', 'det', 'obj', 'cc', 'det', 'conj'],
            ['VERB', 'DET', 'NOUN', 'CCONJ', 'DET', 'NOUN'],
            [(1,3), (4,6)]

        ),
        # Two NPs together
        # lo scrittore brasiliano Aníbal Machado -> lo scrittore brasiliano, Aníbal Machado
        (
            ['lo', 'scrittore', 'brasiliano', 'Aníbal', 'Machado'],
            [1, 1, 1, 1, 3],
            ['det', 'ROOT', 'amod', 'nmod', 'flat:name'],
            ['DET', 'NOUN', 'ADJ', 'PROPN', 'PROPN'],
            [(0, 3), (3, 5)]
        ),
        # Noun compound, person name and titles
        # Dom Pedro II -> Dom Pedro II
        (
            ["Dom", "Pedro", "II"],
            [0, 0, 0],
            ["ROOT", "flat:name", "flat:name"],
            ["PROPN", "PROPN", "PROPN"],
            [(0,3)]
        ),
        # Noun compound created by flat
        # gli Stati Uniti
        (
            ["gli", "Stati", "Uniti"],
            [1, 1, 1],
            ["det", "ROOT", "flat:name"],
            ["DET", "PROPN", "PROPN"],
            [(0,3)]
        ),
        # nmod relation between NPs
        # la distruzione della città -> la distruzione, città
        (
            ['la', 'distruzione', 'della', 'città'],
            [1, 1, 3, 1],
            ['det', 'ROOT', 'case', 'nmod'],
            ['DET', 'NOUN', 'ADP', 'NOUN'],
            [(0,2), (3,4)]
        ),
        # Compounding by nmod, several NPs chained together
        # la prima fabbrica di droga del governo -> la prima fabbrica, droga, governo
        (
            ["la", "prima", "fabbrica", "di", "droga", "del", "governo"],
            [2, 2, 2, 4, 2, 6, 2],
            ['det', 'amod', 'ROOT', 'case', 'nmod', 'case', 'nmod'],
            ['DET', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN'],
            [(0, 3), (4, 5), (6, 7)]
        ),
        # several NPs
        # Traduzione del rapporto di Susana -> Traduzione, rapporto, Susana
        (
            ['Traduzione', 'del', 'rapporto', 'di', 'Susana'],
            [0, 2, 0, 4, 2],
            ['ROOT', 'case', 'nmod', 'case', 'nmod'],
            ['NOUN', 'ADP', 'NOUN', 'ADP', 'PROPN'],
            [(0,1), (2,3), (4,5)]

        ),
        # Several NPs
        # Il gatto grasso di Susana e la sua amica -> Il gatto grasso, Susana, la sua amica
        (
            ['Il', 'gatto', 'grasso', 'di', 'Susana', 'e', 'la', 'sua', 'amica'],
            [1, 1, 1, 4, 1, 8, 8, 8, 1],
            ['det', 'ROOT', 'amod', 'case', 'nmod', 'cc', 'det', 'det:poss', 'conj'],
            ['DET', 'NOUN', 'ADJ', 'ADP', 'PROPN', 'CCONJ', 'DET', 'DET', 'NOUN'],
            [(0,3), (4,5), (6,9)]
        ),
        # Passive subject
        # La nuova spesa è alimentata dal grande conto in banca di Clinton -> La nuova spesa, grande conto, banca, Clinton
        (
            ['La', 'nuova', 'spesa', 'è', 'alimentata', 'dal', 'grande', 'conto', 'in', 'banca', 'di', 'Clinton'],
            [2, 2, 4, 4, 4, 7, 7, 4, 9, 7, 11, 9],
            ['det', 'amod', 'nsubj:pass', 'aux:pass', 'ROOT', 'case', 'amod', 'obl:agent', 'case', 'nmod', 'case', 'nmod'],
            ['DET', 'ADJ', 'NOUN', 'AUX', 'VERB', 'ADP', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'ADP', 'PROPN'],
            [(0, 3), (6, 8), (9, 10), (11,12)]
        ),
        # Misc
        # Ma mentre questo prestito possa ora sembrare gestibile, un improvviso cambiamento delle circostanze potrebbe portare a problemi di debiti -> questo prestito, un improvviso cambiamento, circostanze, problemi, debiti
        (
            ['Ma', 'mentre', 'questo', 'prestito', 'possa', 'ora', 'sembrare', 'gestibile', ',', 'un', 'improvviso', 'cambiamento', 'delle', 'circostanze', 'potrebbe', 'portare', 'a', 'problemi', 'di', 'debiti'],
            [15, 6, 3, 6, 6, 6, 15, 6, 6, 11, 11, 15, 13, 11, 15, 15, 17, 15, 19, 17],
            ['cc', 'mark', 'det', 'nsubj', 'aux', 'advmod', 'advcl', 'xcomp', 'punct', 'det', 'amod', 'nsubj', 'case', 'nmod', 'aux', 'ROOT', 'case', 'obl', 'case', 'nmod'],
            ['CCONJ', 'SCONJ', 'DET', 'NOUN', 'AUX', 'ADV', 'VERB', 'ADJ', 'PUNCT', 'DET', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'AUX', 'VERB', 'ADP', 'NOUN', 'ADP', 'NOUN'],
            [(2,4), (9,12), (13,14), (17,18), (19,20)]
        )
    ],
)
# fmt: on
def test_it_noun_chunks(it_vocab, words, heads, deps, pos, chunk_offsets):
    """Check that the noun chunks of a hand-annotated Italian Doc match the expected offsets."""
    doc = Doc(it_vocab, words=words, heads=heads, deps=deps, pos=pos)
    assert [(c.start, c.end) for c in doc.noun_chunks] == chunk_offsets
|
||||
|
||||
|
||||
def test_noun_chunks_is_parsed_it(it_tokenizer):
    """Iterating noun_chunks on a tokenized-only 'it' Doc must raise ValueError."""
    unparsed = it_tokenizer("Sei andato a Oxford")
    with pytest.raises(ValueError):
        # noun_chunks is lazy; consuming it is what triggers the check.
        for _chunk in unparsed.noun_chunks:
            pass
|
Loading…
Reference in New Issue
Block a user