Merge branch 'develop' into feature/master_copy

Sofie Van Landeghem, 2022-01-20 13:36:17 +01:00 (committed by GitHub)
commit 4465fe0306
22 changed files with 902 additions and 266 deletions

View File

@@ -45,6 +45,10 @@ _hangul_syllables = r"\uAC00-\uD7AF"
 _hangul_jamo = r"\u1100-\u11FF"
 _hangul = _hangul_syllables + _hangul_jamo
 
+_hiragana = r"\u3040-\u309F"
+_katakana = r"\u30A0-\u30FFー"
+_kana = _hiragana + _katakana
+
 # letters with diacritics - Catalan, Czech, Latin, Latvian, Lithuanian, Polish, Slovak, Turkish, Welsh
 _latin_u_extendedA = (
     r"\u0100\u0102\u0104\u0106\u0108\u010A\u010C\u010E\u0110\u0112\u0114\u0116\u0118\u011A\u011C"
@@ -244,6 +248,7 @@ _uncased = (
     + _tamil
     + _telugu
     + _hangul
+    + _kana
     + _cjk
 )
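For context, a minimal sketch of how the new ranges compose (the character ranges are copied from the diff above; the `re` usage is illustrative, not part of the change):

    import re

    _hiragana = r"\u3040-\u309F"
    _katakana = r"\u30A0-\u30FFー"
    _kana = _hiragana + _katakana
    # With _kana folded into _uncased, kana tokens count as caseless script:
    assert re.fullmatch(f"[{_kana}]+", "ひらがなカタカナ")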

View File

@@ -6,16 +6,35 @@ from ...tokens import Doc, Span
 
 def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
-    """Detect base noun phrases from a dependency parse. Works on Doc and Span."""
-    # fmt: off
-    labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
-    # fmt: on
+    """
+    Detect base noun phrases from a dependency parse. Works on both Doc and Span.
+    """
+    labels = [
+        "nsubj",
+        "nsubj:pass",
+        "obj",
+        "obl",
+        "obl:agent",
+        "obl:arg",
+        "obl:mod",
+        "nmod",
+        "pcomp",
+        "appos",
+        "ROOT",
+    ]
+    post_modifiers = ["flat", "flat:name", "flat:foreign", "fixed", "compound"]
     doc = doclike.doc  # Ensure works on both Doc and Span.
     if not doc.has_annotation("DEP"):
         raise ValueError(Errors.E029)
-    np_deps = [doc.vocab.strings[label] for label in labels]
-    conj = doc.vocab.strings.add("conj")
+    np_deps = {doc.vocab.strings.add(label) for label in labels}
+    np_modifs = {doc.vocab.strings.add(modifier) for modifier in post_modifiers}
     np_label = doc.vocab.strings.add("NP")
+    adj_label = doc.vocab.strings.add("amod")
+    det_label = doc.vocab.strings.add("det")
+    det_pos = doc.vocab.strings.add("DET")
+    adp_pos = doc.vocab.strings.add("ADP")
+    conj_label = doc.vocab.strings.add("conj")
+    conj_pos = doc.vocab.strings.add("CCONJ")
     prev_end = -1
     for i, word in enumerate(doclike):
         if word.pos not in (NOUN, PROPN, PRON):
@@ -24,16 +43,45 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
         if word.left_edge.i <= prev_end:
             continue
         if word.dep in np_deps:
-            prev_end = word.right_edge.i
-            yield word.left_edge.i, word.right_edge.i + 1, np_label
-        elif word.dep == conj:
+            right_childs = list(word.rights)
+            right_child = right_childs[0] if right_childs else None
+            if right_child:
+                if (
+                    right_child.dep == adj_label
+                ):  # allow chain of adjectives by expanding to right
+                    right_end = right_child.right_edge
+                elif (
+                    right_child.dep == det_label and right_child.pos == det_pos
+                ):  # cut relative pronouns here
+                    right_end = right_child
+                elif right_child.dep in np_modifs:  # Check if we can expand to right
+                    right_end = word.right_edge
+                else:
+                    right_end = word
+            else:
+                right_end = word
+            prev_end = right_end.i
+            left_index = word.left_edge.i
+            left_index = (
+                left_index + 1 if word.left_edge.pos == adp_pos else left_index
+            )
+            yield left_index, right_end.i + 1, np_label
+        elif word.dep == conj_label:
             head = word.head
-            while head.dep == conj and head.head.i < head.i:
+            while head.dep == conj_label and head.head.i < head.i:
                 head = head.head
             # If the head is an NP, and we're coordinated to it, we're an NP
             if head.dep in np_deps:
-                prev_end = word.right_edge.i
-                yield word.left_edge.i, word.right_edge.i + 1, np_label
+                prev_end = word.i
+                left_index = word.left_edge.i  # eliminate left attached conjunction
+                left_index = (
+                    left_index + 1 if word.left_edge.pos == conj_pos else left_index
+                )
+                yield left_index, word.i + 1, np_label
 
 
 SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
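A quick way to exercise the new French iterator without a trained model is to build a Doc with hand-set annotations, as the tests added later in this commit do (illustrative snippet):

    import spacy
    from spacy.tokens import Doc

    nlp = spacy.blank("fr")
    doc = Doc(
        nlp.vocab,
        words=["la", "destruction", "de", "la", "ville"],
        heads=[1, 1, 4, 4, 1],
        deps=["det", "ROOT", "case", "det", "nmod"],
        pos=["DET", "NOUN", "ADP", "DET", "NOUN"],
    )
    # "nmod" is now in the label set, so "la ville" becomes its own chunk:
    assert [(c.start, c.end) for c in doc.noun_chunks] == [(0, 2), (3, 5)]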

View File

@@ -6,13 +6,15 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
 from ...language import Language, BaseDefaults
 from .lemmatizer import ItalianLemmatizer
+from .syntax_iterators import SYNTAX_ITERATORS
 
 
 class ItalianDefaults(BaseDefaults):
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
-    stop_words = STOP_WORDS
     prefixes = TOKENIZER_PREFIXES
     infixes = TOKENIZER_INFIXES
+    stop_words = STOP_WORDS
+    syntax_iterators = SYNTAX_ITERATORS
 
 
 class Italian(Language):

View File

@@ -10,18 +10,18 @@ avresti avrete avrà avrò avuta avute avuti avuto
 basta bene benissimo brava bravo
-casa caso cento certa certe certi certo che chi chicchessia chiunque ci
+casa caso cento certa certe certi certo che chi chicchessia chiunque ci c'
 ciascuna ciascuno cima cio cioe circa citta città co codesta codesti codesto
 cogli coi col colei coll coloro colui come cominci comunque con concernente
 conciliarsi conclusione consiglio contro cortesia cos cosa cosi così cui
-da dagl dagli dai dal dall dalla dalle dallo dappertutto davanti degl degli
-dei del dell della delle dello dentro detto deve di dice dietro dire
+d' da dagl dagli dai dal dall dall' dalla dalle dallo dappertutto davanti degl degli
+dei del dell dell' della delle dello dentro detto deve di dice dietro dire
 dirimpetto diventa diventare diventato dopo dov dove dovra dovrà dovunque due
 dunque durante
-ebbe ebbero ebbi ecc ecco ed effettivamente egli ella entrambi eppure era
-erano eravamo eravate eri ero esempio esse essendo esser essere essi ex
+e ebbe ebbero ebbi ecc ecco ed effettivamente egli ella entrambi eppure era
+erano eravamo eravate eri ero esempio esse essendo esser essere essi ex è
 fa faccia facciamo facciano facciate faccio facemmo facendo facesse facessero
 facessi facessimo faceste facesti faceva facevamo facevano facevate facevi
@@ -30,21 +30,21 @@ fareste faresti farete farà farò fatto favore fece fecero feci fin finalmente
 finche fine fino forse forza fosse fossero fossi fossimo foste fosti fra
 frattempo fu fui fummo fuori furono futuro generale
-gia già giacche giorni giorno gli gliela gliele glieli glielo gliene governo
+gia già giacche giorni giorno gli gl' gliela gliele glieli glielo gliene governo
 grande grazie gruppo
 ha haha hai hanno ho
 ieri il improvviso in inc infatti inoltre insieme intanto intorno invece io
-la lasciato lato lavoro le lei li lo lontano loro lui lungo luogo
-ma macche magari maggior mai male malgrado malissimo mancanza marche me
+l' la là lasciato lato lavoro le lei li lo lontano loro lui lungo luogo
+m' ma macche magari maggior mai male malgrado malissimo mancanza marche me
 medesimo mediante meglio meno mentre mesi mezzo mi mia mie miei mila miliardi
 milioni minimi ministro mio modo molti moltissimo molto momento mondo mosto
-nazionale ne negl negli nei nel nell nella nelle nello nemmeno neppure nessun
-nessuna nessuno niente no noi non nondimeno nonostante nonsia nostra nostre
+nazionale ne negl negli nei nel nell nella nelle nello nemmeno neppure nessun nessun'
+nessuna nessuno nient' niente no noi non nondimeno nonostante nonsia nostra nostre
 nostri nostro novanta nove nulla nuovo
 od oggi ogni ognuna ognuno oltre oppure ora ore osi ossia ottanta otto
@@ -56,12 +56,12 @@ potrebbe preferibilmente presa press prima primo principalmente probabilmente
 proprio puo può pure purtroppo
 qualche qualcosa qualcuna qualcuno quale quali qualunque quando quanta quante
-quanti quanto quantunque quasi quattro quel quella quelle quelli quello quest
+quanti quanto quantunque quasi quattro quel quel' quella quelle quelli quello quest quest'
 questa queste questi questo qui quindi
 realmente recente recentemente registrazione relativo riecco salvo
-sara sarà sarai saranno sarebbe sarebbero sarei saremmo saremo sareste
+s' sara sarà sarai saranno sarebbe sarebbero sarei saremmo saremo sareste
 saresti sarete saro sarò scola scopo scorso se secondo seguente seguito sei
 sembra sembrare sembrato sembri sempre senza sette si sia siamo siano siate
 siete sig solito solo soltanto sono sopra sotto spesso srl sta stai stando
@@ -72,12 +72,12 @@ steste stesti stette stettero stetti stia stiamo stiano stiate sto su sua
 subito successivamente successivo sue sugl sugli sui sul sull sulla sulle
 sullo suo suoi
-tale tali talvolta tanto te tempo ti titolo tra tranne tre trenta
+t' tale tali talvolta tanto te tempo ti titolo tra tranne tre trenta
 troppo trovato tu tua tue tuo tuoi tutta tuttavia tutte tutti tutto
-uguali ulteriore ultimo un una uno uomo
-va vale vari varia varie vario verso vi via vicino visto vita voi volta volte
+uguali ulteriore ultimo un un' una uno uomo
+v' va vale vari varia varie vario verso vi via vicino visto vita voi volta volte
 vostra vostre vostri vostro
 """.split()
 )

View File

@@ -0,0 +1,86 @@
+from typing import Union, Iterator, Tuple
+
+from ...symbols import NOUN, PROPN, PRON
+from ...errors import Errors
+from ...tokens import Doc, Span
+
+
+def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
+    """
+    Detect base noun phrases from a dependency parse. Works on both Doc and Span.
+    """
+    labels = [
+        "nsubj",
+        "nsubj:pass",
+        "obj",
+        "obl",
+        "obl:agent",
+        "nmod",
+        "pcomp",
+        "appos",
+        "ROOT",
+    ]
+    post_modifiers = ["flat", "flat:name", "fixed", "compound"]
+    dets = ["det", "det:poss"]
+    doc = doclike.doc  # Ensure works on both Doc and Span.
+    if not doc.has_annotation("DEP"):
+        raise ValueError(Errors.E029)
+    np_deps = {doc.vocab.strings.add(label) for label in labels}
+    np_modifs = {doc.vocab.strings.add(modifier) for modifier in post_modifiers}
+    np_label = doc.vocab.strings.add("NP")
+    adj_label = doc.vocab.strings.add("amod")
+    det_labels = {doc.vocab.strings.add(det) for det in dets}
+    det_pos = doc.vocab.strings.add("DET")
+    adp_label = doc.vocab.strings.add("ADP")
+    conj = doc.vocab.strings.add("conj")
+    conj_pos = doc.vocab.strings.add("CCONJ")
+    prev_end = -1
+    for i, word in enumerate(doclike):
+        if word.pos not in (NOUN, PROPN, PRON):
+            continue
+        # Prevent nested chunks from being produced
+        if word.left_edge.i <= prev_end:
+            continue
+        if word.dep in np_deps:
+            right_childs = list(word.rights)
+            right_child = right_childs[0] if right_childs else None
+            if right_child:
+                if (
+                    right_child.dep == adj_label
+                ):  # allow chain of adjectives by expanding to right
+                    right_end = right_child.right_edge
+                elif (
+                    right_child.dep in det_labels and right_child.pos == det_pos
+                ):  # cut relative pronouns here
+                    right_end = right_child
+                elif right_child.dep in np_modifs:  # Check if we can expand to right
+                    right_end = word.right_edge
+                else:
+                    right_end = word
+            else:
+                right_end = word
+            prev_end = right_end.i
+            left_index = word.left_edge.i
+            left_index = (
+                left_index + 1 if word.left_edge.pos == adp_label else left_index
+            )
+            yield left_index, right_end.i + 1, np_label
+        elif word.dep == conj:
+            head = word.head
+            while head.dep == conj and head.head.i < head.i:
+                head = head.head
+            # If the head is an NP, and we're coordinated to it, we're an NP
+            if head.dep in np_deps:
+                prev_end = word.i
+                left_index = word.left_edge.i  # eliminate left attached conjunction
+                left_index = (
+                    left_index + 1 if word.left_edge.pos == conj_pos else left_index
+                )
+                yield left_index, word.i + 1, np_label
+
+
+SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
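An analogous check for the new Italian iterator, mirroring the tests added below (no trained model needed, since the annotations are set by hand):

    import spacy
    from spacy.tokens import Doc

    nlp = spacy.blank("it")
    doc = Doc(
        nlp.vocab,
        words=["il", "mio", "cane"],
        heads=[2, 2, 2],
        deps=["det", "det:poss", "ROOT"],
        pos=["DET", "DET", "NOUN"],
    )
    # "det:poss" is handled as a determiner, so the phrase is one chunk:
    assert [(c.start, c.end) for c in doc.noun_chunks] == [(0, 3)]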

View File

@@ -4,46 +4,42 @@ alle allerede alt and andre annen annet at av
 bak bare bedre beste blant ble bli blir blitt bris by både
-da dag de del dem den denne der dermed det dette disse drept du
+da dag de del dem den denne der dermed det dette disse du
 eller en enn er et ett etter
-fem fikk fire fjor flere folk for fortsatt fotball fra fram frankrike fredag
+fem fikk fire fjor flere folk for fortsatt fra fram
 funnet får fått før først første
 gang gi gikk gjennom gjorde gjort gjør gjøre god godt grunn går
-ha hadde ham han hans har hele helt henne hennes her hun hva hvor hvordan
-hvorfor
+ha hadde ham han hans har hele helt henne hennes her hun
 i ifølge igjen ikke ingen inn
 ja jeg
 kamp kampen kan kl klart kom komme kommer kontakt kort kroner kunne kveld
-kvinner
-la laget land landet langt leder ligger like litt løpet lørdag
-man mandag mange mannen mars med meg mellom men mener menn mennesker mens mer
-millioner minutter mot msci mye mål måtte
-ned neste noe noen nok norge norsk norske ntb ny nye når
-og også om onsdag opp opplyser oslo oss over
-personer plass poeng politidistrikt politiet president prosent på
-regjeringen runde rundt russland
+la laget land landet langt leder ligger like litt løpet
+man mange med meg mellom men mener mennesker mens mer mot mye mål måtte
+ned neste noe noen nok ny nye når
+og også om opp opplyser oss over
+personer plass poeng på
+runde rundt
-sa saken samme sammen samtidig satt se seg seks selv senere september ser sett
+sa saken samme sammen samtidig satt se seg seks selv senere ser sett
 siden sier sin sine siste sitt skal skriver skulle slik som sted stedet stor
-store står sverige svært søndag
+store står svært
-ta tatt tid tidligere til tilbake tillegg tirsdag to tok torsdag tre tror
-tyskland
+ta tatt tid tidligere til tilbake tillegg tok tror
-under usa ut uten utenfor
+under ut uten utenfor
 vant var ved veldig vi videre viktig vil ville viser vår være vært

View File

@@ -1,13 +1,10 @@
 # Source: https://github.com/stopwords-iso/stopwords-sl
-# TODO: probably needs to be tidied up the list seems to have month names in
-# it, which shouldn't be considered stop words.
+# Removed various words that are not normally considered stop words, such as months.
 STOP_WORDS = set(
 """
 a
 ali
-april
-avgust
 b
 bi
 bil
@@ -19,7 +16,6 @@ biti
 blizu
 bo
 bodo
-bojo
 bolj
 bom
 bomo
@@ -37,16 +33,6 @@ da
 daleč
 dan
 danes
-datum
-december
-deset
-deseta
-deseti
-deseto
-devet
-deveta
-deveti
-deveto
 do
 dober
 dobra
@@ -54,16 +40,7 @@ dobri
 dobro
 dokler
 dol
-dolg
-dolga
-dolgi
 dovolj
-drug
-druga
-drugi
-drugo
-dva
-dve
 e
 eden
 en
@@ -74,7 +51,6 @@ enkrat
 eno
 etc.
 f
-februar
 g
 g.
 ga
@@ -93,16 +69,12 @@ iv
 ix
 iz
 j
-januar
 jaz
 je
 ji
 jih
 jim
 jo
-julij
-junij
-jutri
 k
 kadarkoli
 kaj
@@ -123,41 +95,23 @@ kje
 kjer
 kjerkoli
 ko
-koder
 koderkoli
 koga
 komu
 kot
-kratek
-kratka
-kratke
-kratki
 l
-lahka
-lahke
-lahki
-lahko
 le
 lep
 lepa
 lepe
 lepi
 lepo
-leto
 m
-maj
-majhen
-majhna
-majhni
-malce
-malo
 manj
-marec
 me
 med
 medtem
 mene
-mesec
 mi
 midva
 midve
@@ -183,7 +137,6 @@ najmanj
 naju
 največ
 nam
-narobe
 nas
 nato
 nazaj
@@ -192,7 +145,6 @@ naša
 naše
 ne
 nedavno
-nedelja
 nek
 neka
 nekaj
@@ -236,7 +188,6 @@ njuna
 njuno
 no
 nocoj
-november
 npr.
 o
 ob
@@ -244,51 +195,23 @@ oba
 obe
 oboje
 od
-odprt
-odprta
-odprti
 okoli
-oktober
 on
 onadva
 one
 oni
 onidve
-osem
-osma
-osmi
-osmo
 oz.
 p
 pa
-pet
-peta
-petek
-peti
-peto
 po
 pod
 pogosto
 poleg
-poln
-polna
-polni
-polno
 ponavadi
-ponedeljek
 ponovno
 potem
 povsod
-pozdravljen
-pozdravljeni
-prav
-prava
-prave
-pravi
-pravo
-prazen
-prazna
-prazno
 prbl.
 precej
 pred
@@ -297,19 +220,10 @@ preko
 pri
 pribl.
 približno
-primer
-pripravljen
-pripravljena
-pripravljeni
 proti
-prva
-prvi
-prvo
 r
-ravno
 redko
 res
-reč
 s
 saj
 sam
@@ -321,29 +235,17 @@ se
 sebe
 sebi
 sedaj
-sedem
-sedma
-sedmi
-sedmo
 sem
-september
 seveda
 si
 sicer
 skoraj
 skozi
-slab
 smo
 so
-sobota
 spet
-sreda
-srednja
-srednji
 sta
 ste
-stran
-stvar
 sva
 t
 ta
@@ -358,10 +260,6 @@ te
 tebe
 tebi
 tega
-težak
-težka
-težki
-težko
 ti
 tista
 tiste
@@ -371,11 +269,6 @@ tj.
 tja
 to
 toda
-torek
-tretja
-tretje
-tretji
-tri
 tu
 tudi
 tukaj
@@ -392,10 +285,6 @@ vaša
 vaše
 ve
 vedno
-velik
-velika
-veliki
-veliko
 vendar
 ves
 več
@@ -403,10 +292,6 @@ vi
 vidva
 vii
 viii
-visok
-visoka
-visoke
-visoki
 vsa
 vsaj
 vsak
@@ -420,34 +305,21 @@ vsega
 vsi
 vso
 včasih
-včeraj
 x
 z
 za
 zadaj
 zadnji
 zakaj
-zaprta
-zaprti
-zaprto
 zdaj
 zelo
 zunaj
 č
 če
 često
-četrta
-četrtek
-četrti
-četrto
 čez
 čigav
 š
-šest
-šesta
-šesti
-šesto
-štiri
 ž
 že
 """.split()

View File

@@ -3,6 +3,7 @@ from libc.string cimport memcpy, memset
 from libc.stdlib cimport calloc, free
 from libc.stdint cimport uint32_t, uint64_t
 cimport libcpp
+from libcpp.unordered_map cimport unordered_map
 from libcpp.vector cimport vector
 from libcpp.set cimport set
 from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno
@@ -30,8 +31,8 @@ cdef cppclass StateC:
     vector[int] _stack
     vector[int] _rebuffer
     vector[SpanC] _ents
-    vector[ArcC] _left_arcs
-    vector[ArcC] _right_arcs
+    unordered_map[int, vector[ArcC]] _left_arcs
+    unordered_map[int, vector[ArcC]] _right_arcs
     vector[libcpp.bool] _unshiftable
     set[int] _sent_starts
     TokenC _empty_token
@@ -160,15 +161,22 @@ cdef cppclass StateC:
         else:
             return &this._sent[i]
 
-    void get_arcs(vector[ArcC]* arcs) nogil const:
-        for i in range(this._left_arcs.size()):
-            arc = this._left_arcs.at(i)
-            if arc.head != -1 and arc.child != -1:
-                arcs.push_back(arc)
-        for i in range(this._right_arcs.size()):
-            arc = this._right_arcs.at(i)
-            if arc.head != -1 and arc.child != -1:
-                arcs.push_back(arc)
+    void map_get_arcs(const unordered_map[int, vector[ArcC]] &heads_arcs, vector[ArcC]* out) nogil const:
+        cdef const vector[ArcC]* arcs
+        head_arcs_it = heads_arcs.const_begin()
+        while head_arcs_it != heads_arcs.const_end():
+            arcs = &deref(head_arcs_it).second
+            arcs_it = arcs.const_begin()
+            while arcs_it != arcs.const_end():
+                arc = deref(arcs_it)
+                if arc.head != -1 and arc.child != -1:
+                    out.push_back(arc)
+                incr(arcs_it)
+            incr(head_arcs_it)
+
+    void get_arcs(vector[ArcC]* out) nogil const:
+        this.map_get_arcs(this._left_arcs, out)
+        this.map_get_arcs(this._right_arcs, out)
 
     int H(int child) nogil const:
         if child >= this.length or child < 0:
@@ -182,37 +190,35 @@ cdef cppclass StateC:
         else:
             return this._ents.back().start
 
-    int L(int head, int idx) nogil const:
-        if idx < 1 or this._left_arcs.size() == 0:
+    int nth_child(const unordered_map[int, vector[ArcC]]& heads_arcs, int head, int idx) nogil const:
+        if idx < 1:
             return -1
-        # Work backwards through left-arcs to find the arc at the
+
+        head_arcs_it = heads_arcs.const_find(head)
+        if head_arcs_it == heads_arcs.const_end():
+            return -1
+
+        cdef const vector[ArcC]* arcs = &deref(head_arcs_it).second
+
+        # Work backwards through arcs to find the arc at the
         # requested index more quickly.
         cdef size_t child_index = 0
-        it = this._left_arcs.const_rbegin()
-        while it != this._left_arcs.rend():
-            arc = deref(it)
-            if arc.head == head and arc.child != -1 and arc.child < head:
+        arcs_it = arcs.const_rbegin()
+        while arcs_it != arcs.const_rend() and child_index != idx:
+            arc = deref(arcs_it)
+            if arc.child != -1:
                 child_index += 1
                 if child_index == idx:
                     return arc.child
-            incr(it)
+            incr(arcs_it)
+
         return -1
 
-    int R(int head, int idx) nogil const:
-        if idx < 1 or this._right_arcs.size() == 0:
-            return -1
-        cdef vector[int] rights
-        for i in range(this._right_arcs.size()):
-            arc = this._right_arcs.at(i)
-            if arc.head == head and arc.child != -1 and arc.child > head:
-                rights.push_back(arc.child)
-        idx = (<int>rights.size()) - idx
-        if idx < 0:
-            return -1
-        else:
-            return rights.at(idx)
+    int L(int head, int idx) nogil const:
+        return this.nth_child(this._left_arcs, head, idx)
+
+    int R(int head, int idx) nogil const:
+        return this.nth_child(this._right_arcs, head, idx)
 
     bint empty() nogil const:
         return this._stack.size() == 0
@@ -254,22 +260,29 @@ cdef cppclass StateC:
     int r_edge(int word) nogil const:
         return word
 
-    int n_L(int head) nogil const:
+    int n_arcs(const unordered_map[int, vector[ArcC]] &heads_arcs, int head) nogil const:
         cdef int n = 0
-        for i in range(this._left_arcs.size()):
-            arc = this._left_arcs.at(i)
-            if arc.head == head and arc.child != -1 and arc.child < arc.head:
-                n += 1
+        head_arcs_it = heads_arcs.const_find(head)
+        if head_arcs_it == heads_arcs.const_end():
+            return n
+
+        cdef const vector[ArcC]* arcs = &deref(head_arcs_it).second
+        arcs_it = arcs.const_begin()
+        while arcs_it != arcs.end():
+            arc = deref(arcs_it)
+            if arc.child != -1:
+                n += 1
+            incr(arcs_it)
+
         return n
 
-    int n_R(int head) nogil const:
-        cdef int n = 0
-        for i in range(this._right_arcs.size()):
-            arc = this._right_arcs.at(i)
-            if arc.head == head and arc.child != -1 and arc.child > arc.head:
-                n += 1
-        return n
+    int n_L(int head) nogil const:
+        return n_arcs(this._left_arcs, head)
+
+    int n_R(int head) nogil const:
+        return n_arcs(this._right_arcs, head)
 
     bint stack_is_connected() nogil const:
         return False
@@ -328,19 +341,20 @@ cdef cppclass StateC:
         arc.child = child
         arc.label = label
         if head > child:
-            this._left_arcs.push_back(arc)
+            this._left_arcs[arc.head].push_back(arc)
         else:
-            this._right_arcs.push_back(arc)
+            this._right_arcs[arc.head].push_back(arc)
         this._heads[child] = head
 
-    void del_arc(int h_i, int c_i) nogil:
-        cdef vector[ArcC]* arcs
-        if h_i > c_i:
-            arcs = &this._left_arcs
-        else:
-            arcs = &this._right_arcs
+    void map_del_arc(unordered_map[int, vector[ArcC]]* heads_arcs, int h_i, int c_i) nogil:
+        arcs_it = heads_arcs.find(h_i)
+        if arcs_it == heads_arcs.end():
+            return
+
+        arcs = &deref(arcs_it).second
         if arcs.size() == 0:
             return
+
         arc = arcs.back()
         if arc.head == h_i and arc.child == c_i:
             arcs.pop_back()
@@ -353,6 +367,12 @@ cdef cppclass StateC:
                 arc.label = 0
                 break
 
+    void del_arc(int h_i, int c_i) nogil:
+        if h_i > c_i:
+            this.map_del_arc(&this._left_arcs, h_i, c_i)
+        else:
+            this.map_del_arc(&this._right_arcs, h_i, c_i)
+
     SpanC get_ent() nogil const:
        cdef SpanC ent
        if this._ents.size() == 0:
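The structural change in this file swaps flat arc vectors for per-head maps, so lookups such as L, R, n_L and n_R only scan one head's children instead of every arc in the state. An illustrative Python analogue of the data structure (not spaCy API):

    from collections import defaultdict

    # Arcs grouped by head; deleted arcs are marked with child == -1.
    left_arcs = defaultdict(list)
    left_arcs[5].append({"head": 5, "child": 2, "label": "det"})
    left_arcs[5].append({"head": 5, "child": 4, "label": "amod"})

    def n_L(head: int) -> int:
        # Count valid left children of `head` without scanning other heads.
        return sum(1 for arc in left_arcs.get(head, []) if arc["child"] != -1)

    assert n_L(5) == 2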

View File

@@ -1,6 +1,6 @@
 # cython: infer_types=True, profile=True, binding=True
-from itertools import islice
 from typing import Optional, Callable
+from itertools import islice
 
 import srsly
 from thinc.api import Model, SequenceCategoricalCrossentropy, Config

View File

@@ -1,9 +1,10 @@
+import numpy
 from typing import List, Dict, Callable, Tuple, Optional, Iterable, Any, cast
 from thinc.api import Config, Model, get_current_ops, set_dropout_rate, Ops
 from thinc.api import Optimizer
 from thinc.types import Ragged, Ints2d, Floats2d, Ints1d
-import numpy
 
 from ..compat import Protocol, runtime_checkable
 from ..scorer import Scorer
 from ..language import Language

View File

@@ -1,8 +1,8 @@
+from itertools import islice
 from typing import Iterable, Tuple, Optional, Dict, List, Callable, Any
 from thinc.api import get_array_module, Model, Optimizer, set_dropout_rate, Config
 from thinc.types import Floats2d
 import numpy
-from itertools import islice
 
 from .trainable_pipe import TrainablePipe
 from ..language import Language
@@ -158,6 +158,13 @@ class TextCategorizer(TrainablePipe):
         self.cfg = dict(cfg)
         self.scorer = scorer
 
+    @property
+    def support_missing_values(self):
+        # There are no missing values as the textcat should always
+        # predict exactly one label. All other labels are 0.0
+        # Subclasses may override this property to change internal behaviour.
+        return False
+
     @property
     def labels(self) -> Tuple[str]:
         """RETURNS (Tuple[str]): The labels currently added to the component.
@@ -294,7 +301,7 @@ class TextCategorizer(TrainablePipe):
             for j, label in enumerate(self.labels):
                 if label in eg.reference.cats:
                     truths[i, j] = eg.reference.cats[label]
-                else:
+                elif self.support_missing_values:
                     not_missing[i, j] = 0.0
         truths = self.model.ops.asarray(truths)  # type: ignore
         return truths, not_missing  # type: ignore
@@ -313,9 +320,9 @@ class TextCategorizer(TrainablePipe):
         self._validate_categories(examples)
         truths, not_missing = self._examples_to_truth(examples)
         not_missing = self.model.ops.asarray(not_missing)  # type: ignore
-        d_scores = (scores - truths) / scores.shape[0]
+        d_scores = (scores - truths)
         d_scores *= not_missing
-        mean_square_error = (d_scores ** 2).sum(axis=1).mean()
+        mean_square_error = (d_scores ** 2).mean()
         return float(mean_square_error), d_scores
 
     def add_label(self, label: str) -> int:
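The loss change makes get_loss batch-size invariant, which is what the new test_issue9904 below asserts. A small numeric sketch of why (illustrative, plain NumPy):

    import numpy as np

    scores = np.array([[0.0, 1.0]])
    truths = np.array([[0.0, 0.0]])

    def old_loss(scores, truths):
        d = (scores - truths) / scores.shape[0]
        return (d ** 2).sum(axis=1).mean()

    def new_loss(scores, truths):
        d = scores - truths
        return (d ** 2).mean()

    doubled = np.repeat(scores, 2, axis=0), np.repeat(truths, 2, axis=0)
    assert old_loss(scores, truths) != old_loss(*doubled)  # 1.0 vs 0.25
    assert new_loss(scores, truths) == new_loss(*doubled)  # 0.5 both times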

View File

@@ -1,8 +1,8 @@
+from itertools import islice
 from typing import Iterable, Optional, Dict, List, Callable, Any
-from thinc.api import Model, Config
 from thinc.types import Floats2d
-from itertools import islice
+from thinc.api import Model, Config
 
 from ..language import Language
 from ..training import Example, validate_get_examples
@@ -158,6 +158,10 @@ class MultiLabel_TextCategorizer(TextCategorizer):
         self.cfg = dict(cfg)
         self.scorer = scorer
 
+    @property
+    def support_missing_values(self):
+        return True
+
     def initialize(  # type: ignore[override]
         self,
         get_examples: Callable[[], Iterable[Example]],
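With support_missing_values returning True here, _examples_to_truth in the parent class zeroes the not_missing mask for any label absent from the gold cats, so that label drops out of the gradient. A hypothetical one-example illustration:

    import numpy as np

    # Labels: [winter, summer, autumn, spring]; 'spring' missing from the gold.
    truths = np.array([[0.0, 0.0, 1.0, 0.0]])
    not_missing = np.array([[1.0, 1.0, 1.0, 0.0]])  # multilabel masks 'spring'
    scores = np.array([[0.0, 0.0, 1.0, 1.0]])
    d_scores = (scores - truths) * not_missing
    assert d_scores[0, 3] == 0.0  # the stray 'spring' score adds no gradient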

View File

@@ -445,7 +445,8 @@ class Scorer:
             getter(doc, attr) should return the values for the individual doc.
         labels (Iterable[str]): The set of possible labels. Defaults to [].
         multi_label (bool): Whether the attribute allows multiple labels.
-            Defaults to True.
+            Defaults to True. When set to False (exclusive labels), missing
+            gold labels are interpreted as 0.0.
         positive_label (str): The positive label for a binary task with
             exclusive classes. Defaults to None.
         threshold (float): Cutoff to consider a prediction "positive". Defaults
@@ -484,13 +485,15 @@ class Scorer:
             for label in labels:
                 pred_score = pred_cats.get(label, 0.0)
-                gold_score = gold_cats.get(label, 0.0)
+                gold_score = gold_cats.get(label)
+                if not gold_score and not multi_label:
+                    gold_score = 0.0
                 if gold_score is not None:
                     auc_per_type[label].score_set(pred_score, gold_score)
             if multi_label:
                 for label in labels:
                     pred_score = pred_cats.get(label, 0.0)
-                    gold_score = gold_cats.get(label, 0.0)
+                    gold_score = gold_cats.get(label)
                     if gold_score is not None:
                         if pred_score >= threshold and gold_score > 0:
                             f_per_type[label].tp += 1
@@ -502,7 +505,6 @@ class Scorer:
                 # Get the highest-scoring for each.
                 pred_label, pred_score = max(pred_cats.items(), key=lambda it: it[1])
                 gold_label, gold_score = max(gold_cats.items(), key=lambda it: it[1])
-                if gold_score is not None:
                 if pred_label == gold_label and pred_score >= threshold:
                     f_per_type[pred_label].tp += 1
                 else:
@@ -511,7 +513,7 @@ class Scorer:
                     f_per_type[pred_label].fp += 1
             elif gold_cats:
                 gold_label, gold_score = max(gold_cats, key=lambda it: it[1])
-                if gold_score is not None and gold_score > 0:
+                if gold_score > 0:
                     f_per_type[gold_label].fn += 1
             elif pred_cats:
                 pred_label, pred_score = max(pred_cats.items(), key=lambda it: it[1])
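A dict-level illustration of the new gold lookup: .get(label) now returns None for a missing gold label; with exclusive labels (multi_label=False) that None is coerced to 0.0, while the multi-label branch skips the label entirely:

    gold_cats = {"winter": 0.0, "summer": 0.0, "autumn": 1.0}  # 'spring' missing
    for multi_label in (False, True):
        gold_score = gold_cats.get("spring")
        if not gold_score and not multi_label:
            gold_score = 0.0
        print(multi_label, gold_score)  # False -> 0.0, True -> None (skipped)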

View File

@@ -155,6 +155,11 @@ def fr_tokenizer():
     return get_lang_class("fr")().tokenizer
 
 
+@pytest.fixture(scope="session")
+def fr_vocab():
+    return get_lang_class("fr")().vocab
+
+
 @pytest.fixture(scope="session")
 def ga_tokenizer():
     return get_lang_class("ga")().tokenizer
@@ -205,6 +210,11 @@ def it_tokenizer():
     return get_lang_class("it")().tokenizer
 
 
+@pytest.fixture(scope="session")
+def it_vocab():
+    return get_lang_class("it")().vocab
+
+
 @pytest.fixture(scope="session")
 def ja_tokenizer():
     pytest.importorskip("sudachipy")

View File

@@ -573,6 +573,55 @@ def test_span_with_vectors(doc):
     doc.vocab.vectors = prev_vectors
 
 
+# fmt: off
+def test_span_comparison(doc):
+    # Identical start, end, only differ in label and kb_id
+    assert Span(doc, 0, 3) == Span(doc, 0, 3)
+    assert Span(doc, 0, 3, "LABEL") == Span(doc, 0, 3, "LABEL")
+    assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") == Span(doc, 0, 3, "LABEL", kb_id="KB_ID")
+    assert Span(doc, 0, 3) != Span(doc, 0, 3, "LABEL")
+    assert Span(doc, 0, 3) != Span(doc, 0, 3, "LABEL", kb_id="KB_ID")
+    assert Span(doc, 0, 3, "LABEL") != Span(doc, 0, 3, "LABEL", kb_id="KB_ID")
+
+    assert Span(doc, 0, 3) <= Span(doc, 0, 3) and Span(doc, 0, 3) >= Span(doc, 0, 3)
+    assert Span(doc, 0, 3, "LABEL") <= Span(doc, 0, 3, "LABEL") and Span(doc, 0, 3, "LABEL") >= Span(doc, 0, 3, "LABEL")
+    assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") <= Span(doc, 0, 3, "LABEL", kb_id="KB_ID")
+    assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") >= Span(doc, 0, 3, "LABEL", kb_id="KB_ID")
+
+    assert (Span(doc, 0, 3) < Span(doc, 0, 3, "", kb_id="KB_ID") < Span(doc, 0, 3, "LABEL") < Span(doc, 0, 3, "LABEL", kb_id="KB_ID"))
+    assert (Span(doc, 0, 3) <= Span(doc, 0, 3, "", kb_id="KB_ID") <= Span(doc, 0, 3, "LABEL") <= Span(doc, 0, 3, "LABEL", kb_id="KB_ID"))
+    assert (Span(doc, 0, 3, "LABEL", kb_id="KB_ID") > Span(doc, 0, 3, "LABEL") > Span(doc, 0, 3, "", kb_id="KB_ID") > Span(doc, 0, 3))
+    assert (Span(doc, 0, 3, "LABEL", kb_id="KB_ID") >= Span(doc, 0, 3, "LABEL") >= Span(doc, 0, 3, "", kb_id="KB_ID") >= Span(doc, 0, 3))
+
+    # Different end
+    assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") < Span(doc, 0, 4, "LABEL", kb_id="KB_ID")
+    assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") < Span(doc, 0, 4)
+    assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") <= Span(doc, 0, 4)
+    assert Span(doc, 0, 4) > Span(doc, 0, 3, "LABEL", kb_id="KB_ID")
+    assert Span(doc, 0, 4) >= Span(doc, 0, 3, "LABEL", kb_id="KB_ID")
+
+    # Different start
+    assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") != Span(doc, 1, 3, "LABEL", kb_id="KB_ID")
+    assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") < Span(doc, 1, 3)
+    assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") <= Span(doc, 1, 3)
+    assert Span(doc, 1, 3) > Span(doc, 0, 3, "LABEL", kb_id="KB_ID")
+    assert Span(doc, 1, 3) >= Span(doc, 0, 3, "LABEL", kb_id="KB_ID")
+
+    # Different start & different end
+    assert Span(doc, 0, 4, "LABEL", kb_id="KB_ID") != Span(doc, 1, 3, "LABEL", kb_id="KB_ID")
+    assert Span(doc, 0, 4, "LABEL", kb_id="KB_ID") < Span(doc, 1, 3)
+    assert Span(doc, 0, 4, "LABEL", kb_id="KB_ID") <= Span(doc, 1, 3)
+    assert Span(doc, 1, 3) > Span(doc, 0, 4, "LABEL", kb_id="KB_ID")
+    assert Span(doc, 1, 3) >= Span(doc, 0, 4, "LABEL", kb_id="KB_ID")
+# fmt: on
+
+
 @pytest.mark.parametrize(
     "start,end,expected_sentences,expected_sentences_with_hook",
     [

View File

@@ -1,8 +1,230 @@
+from spacy.tokens import Doc
 import pytest
 
 
+# fmt: off
+@pytest.mark.parametrize(
+    "words,heads,deps,pos,chunk_offsets",
+    [
+        # determiner + noun
+        # un nom -> un nom
+        (
+            ["un", "nom"],
+            [1, 1],
+            ["det", "ROOT"],
+            ["DET", "NOUN"],
+            [(0, 2)],
+        ),
+        # determiner + noun starting with vowel
+        # l'heure -> l'heure
+        (
+            ["l'", "heure"],
+            [1, 1],
+            ["det", "ROOT"],
+            ["DET", "NOUN"],
+            [(0, 2)],
+        ),
+        # determiner + plural noun
+        # les romans -> les romans
+        (
+            ["les", "romans"],
+            [1, 1],
+            ["det", "ROOT"],
+            ["DET", "NOUN"],
+            [(0, 2)],
+        ),
+        # det + adj + noun
+        # Le vieux Londres -> Le vieux Londres
+        (
+            ['Les', 'vieux', 'Londres'],
+            [2, 2, 2],
+            ["det", "amod", "ROOT"],
+            ["DET", "ADJ", "NOUN"],
+            [(0,3)]
+        ),
+        # det + noun + adj
+        # le nom propre -> le nom propre a proper noun
+        (
+            ["le", "nom", "propre"],
+            [1, 1, 1],
+            ["det", "ROOT", "amod"],
+            ["DET", "NOUN", "ADJ"],
+            [(0, 3)],
+        ),
+        # det + noun + adj plural
+        # Les chiens bruns -> les chiens bruns
+        (
+            ["Les", "chiens", "bruns"],
+            [1, 1, 1],
+            ["det", "ROOT", "amod"],
+            ["DET", "NOUN", "ADJ"],
+            [(0, 3)],
+        ),
+        # multiple adjectives: one adj before the noun, one adj after the noun
+        # un nouveau film intéressant -> un nouveau film intéressant
+        (
+            ["un", "nouveau", "film", "intéressant"],
+            [2, 2, 2, 2],
+            ["det", "amod", "ROOT", "amod"],
+            ["DET", "ADJ", "NOUN", "ADJ"],
+            [(0,4)]
+        ),
+        # multiple adjectives, both adjs after the noun
+        # une personne intelligente et drôle -> une personne intelligente et drôle
+        (
+            ["une", "personne", "intelligente", "et", "drôle"],
+            [1, 1, 1, 4, 2],
+            ["det", "ROOT", "amod", "cc", "conj"],
+            ["DET", "NOUN", "ADJ", "CCONJ", "ADJ"],
+            [(0,5)]
+        ),
+        # relative pronoun
+        # un bus qui va au ville -> un bus, qui, ville
+        (
+            ['un', 'bus', 'qui', 'va', 'au', 'ville'],
+            [1, 1, 3, 1, 5, 3],
+            ['det', 'ROOT', 'nsubj', 'acl:relcl', 'case', 'obl:arg'],
+            ['DET', 'NOUN', 'PRON', 'VERB', 'ADP', 'NOUN'],
+            [(0,2), (2,3), (5,6)]
+        ),
+        # relative subclause
+        # Voilà la maison que nous voulons acheter -> la maison, nous   That's the house that we want to buy.
+        (
+            ['Voilà', 'la', 'maison', 'que', 'nous', 'voulons', 'acheter'],
+            [0, 2, 0, 5, 5, 2, 5],
+            ['ROOT', 'det', 'obj', 'mark', 'nsubj', 'acl:relcl', 'xcomp'],
+            ['VERB', 'DET', 'NOUN', 'SCONJ', 'PRON', 'VERB', 'VERB'],
+            [(1,3), (4,5)]
+        ),
+        # Person name and title by flat
+        # Louis XIV -> Louis XIV
+        (
+            ["Louis", "XIV"],
+            [0, 0],
+            ["ROOT", "flat:name"],
+            ["PROPN", "PROPN"],
+            [(0,2)]
+        ),
+        # Organization name by flat
+        # Nations Unies -> Nations Unies
+        (
+            ["Nations", "Unies"],
+            [0, 0],
+            ["ROOT", "flat:name"],
+            ["PROPN", "PROPN"],
+            [(0,2)]
+        ),
+        # Noun compound, person name created by two flats
+        # Louise de Bratagne -> Louise de Bratagne
+        (
+            ["Louise", "de", "Bratagne"],
+            [0, 0, 0],
+            ["ROOT", "flat:name", "flat:name"],
+            ["PROPN", "PROPN", "PROPN"],
+            [(0,3)]
+        ),
+        # Noun compound, person name created by two flats
+        # Louis François Joseph -> Louis François Joseph
+        (
+            ["Louis", "François", "Joseph"],
+            [0, 0, 0],
+            ["ROOT", "flat:name", "flat:name"],
+            ["PROPN", "PROPN", "PROPN"],
+            [(0,3)]
+        ),
+        # one determiner + one noun + one adjective qualified by an adverb
+        # quelques agriculteurs très riches -> quelques agriculteurs très riches
+        (
+            ["quelques", "agriculteurs", "très", "riches"],
+            [1, 1, 3, 1],
+            ['det', 'ROOT', 'advmod', 'amod'],
+            ['DET', 'NOUN', 'ADV', 'ADJ'],
+            [(0,4)]
+        ),
+        # Two NPs conjuncted
+        # Il a un chien et un chat -> Il, un chien, un chat
+        (
+            ['Il', 'a', 'un', 'chien', 'et', 'un', 'chat'],
+            [1, 1, 3, 1, 6, 6, 3],
+            ['nsubj', 'ROOT', 'det', 'obj', 'cc', 'det', 'conj'],
+            ['PRON', 'VERB', 'DET', 'NOUN', 'CCONJ', 'DET', 'NOUN'],
+            [(0,1), (2,4), (5,7)]
+        ),
+        # Two NPs together
+        # l'écrivain brésilien Aníbal Machado -> l'écrivain brésilien, Aníbal Machado
+        (
+            ["l'", 'écrivain', 'brésilien', 'Aníbal', 'Machado'],
+            [1, 1, 1, 1, 3],
+            ['det', 'ROOT', 'amod', 'appos', 'flat:name'],
+            ['DET', 'NOUN', 'ADJ', 'PROPN', 'PROPN'],
+            [(0, 3), (3, 5)]
+        ),
+        # nmod relation between NPs
+        # la destruction de la ville -> la destruction, la ville
+        (
+            ['la', 'destruction', 'de', 'la', 'ville'],
+            [1, 1, 4, 4, 1],
+            ['det', 'ROOT', 'case', 'det', 'nmod'],
+            ['DET', 'NOUN', 'ADP', 'DET', 'NOUN'],
+            [(0,2), (3,5)]
+        ),
+        # nmod relation between NPs
+        # Archiduchesse d'Autriche -> Archiduchesse, Autriche
+        (
+            ['Archiduchesse', 'd', 'Autriche'],
+            [0, 2, 0],
+            ['ROOT', 'case', 'nmod'],
+            ['NOUN', 'ADP', 'PROPN'],
+            [(0,1), (2,3)]
+        ),
+        # Compounding by nmod, several NPs chained together
+        # la première usine de drogue du gouvernement -> la première usine, drogue, gouvernement
+        (
+            ["la", "première", "usine", "de", "drogue", "du", "gouvernement"],
+            [2, 2, 2, 4, 2, 6, 2],
+            ['det', 'amod', 'ROOT', 'case', 'nmod', 'case', 'nmod'],
+            ['DET', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN'],
+            [(0, 3), (4, 5), (6, 7)]
+        ),
+        # several NPs
+        # Traduction du rapport de Susana -> Traduction, rapport, Susana
+        (
+            ['Traduction', 'du', 'raport', 'de', 'Susana'],
+            [0, 2, 0, 4, 2],
+            ['ROOT', 'case', 'nmod', 'case', 'nmod'],
+            ['NOUN', 'ADP', 'NOUN', 'ADP', 'PROPN'],
+            [(0,1), (2,3), (4,5)]
+        ),
+        # Several NPs
+        # Le gros chat de Susana et son amie -> Le gros chat, Susana, son amie
+        (
+            ['Le', 'gros', 'chat', 'de', 'Susana', 'et', 'son', 'amie'],
+            [2, 2, 2, 4, 2, 7, 7, 2],
+            ['det', 'amod', 'ROOT', 'case', 'nmod', 'cc', 'det', 'conj'],
+            ['DET', 'ADJ', 'NOUN', 'ADP', 'PROPN', 'CCONJ', 'DET', 'NOUN'],
+            [(0,3), (4,5), (6,8)]
+        ),
+        # Passive subject
+        # Les nouvelles dépenses sont alimentées par le grand compte bancaire de Clinton -> Les nouvelles dépenses, le grand compte bancaire, Clinton
+        (
+            ['Les', 'nouvelles', 'dépenses', 'sont', 'alimentées', 'par', 'le', 'grand', 'compte', 'bancaire', 'de', 'Clinton'],
+            [2, 2, 4, 4, 4, 8, 8, 8, 4, 8, 11, 8],
+            ['det', 'amod', 'nsubj:pass', 'aux:pass', 'ROOT', 'case', 'det', 'amod', 'obl:agent', 'amod', 'case', 'nmod'],
+            ['DET', 'ADJ', 'NOUN', 'AUX', 'VERB', 'ADP', 'DET', 'ADJ', 'NOUN', 'ADJ', 'ADP', 'PROPN'],
+            [(0, 3), (6, 10), (11, 12)]
+        )
+    ],
+)
+# fmt: on
+def test_fr_noun_chunks(fr_vocab, words, heads, deps, pos, chunk_offsets):
+    doc = Doc(fr_vocab, words=words, heads=heads, deps=deps, pos=pos)
+    assert [(c.start, c.end) for c in doc.noun_chunks] == chunk_offsets
+
+
 def test_noun_chunks_is_parsed_fr(fr_tokenizer):
     """Test that noun_chunks raises Value Error for 'fr' language if Doc is not parsed."""
-    doc = fr_tokenizer("trouver des travaux antérieurs")
+    doc = fr_tokenizer("Je suis allé à l'école")
     with pytest.raises(ValueError):
         list(doc.noun_chunks)

View File

@@ -0,0 +1,221 @@
+from spacy.tokens import Doc
+import pytest
+
+
+# fmt: off
+@pytest.mark.parametrize(
+    "words,heads,deps,pos,chunk_offsets",
+    [
+        # determiner + noun
+        # un pollo -> un pollo
+        (
+            ["un", "pollo"],
+            [1, 1],
+            ["det", "ROOT"],
+            ["DET", "NOUN"],
+            [(0,2)],
+        ),
+        # two determiners + noun
+        # il mio cane -> il mio cane
+        (
+            ["il", "mio", "cane"],
+            [2, 2, 2],
+            ["det", "det:poss", "ROOT"],
+            ["DET", "DET", "NOUN"],
+            [(0,3)],
+        ),
+        # two determiners, one is after noun. rare usage but still testing
+        # il cane mio-> il cane mio
+        (
+            ["il", "cane", "mio"],
+            [1, 1, 1],
+            ["det", "ROOT", "det:poss"],
+            ["DET", "NOUN", "DET"],
+            [(0,3)],
+        ),
+        # relative pronoun
+        # È molto bello il vestito che hai acquistat -> il vestito, che   the dress that you bought is very pretty.
+        (
+            ["È", "molto", "bello", "il", "vestito", "che", "hai", "acquistato"],
+            [2, 2, 2, 4, 2, 7, 7, 4],
+            ['cop', 'advmod', 'ROOT', 'det', 'nsubj', 'obj', 'aux', 'acl:relcl'],
+            ['AUX', 'ADV', 'ADJ', 'DET', 'NOUN', 'PRON', 'AUX', 'VERB'],
+            [(3,5), (5,6)]
+        ),
+        # relative subclause
+        # il computer che hai comprato -> il computer, che   the computer that you bought
+        (
+            ['il', 'computer', 'che', 'hai', 'comprato'],
+            [1, 1, 4, 4, 1],
+            ['det', 'ROOT', 'nsubj', 'aux', 'acl:relcl'],
+            ['DET', 'NOUN', 'PRON', 'AUX', 'VERB'],
+            [(0,2), (2,3)]
+        ),
+        # det + noun + adj
+        # Una macchina grande -> Una macchina grande
+        (
+            ["Una", "macchina", "grande"],
+            [1, 1, 1],
+            ["det", "ROOT", "amod"],
+            ["DET", "NOUN", "ADJ"],
+            [(0,3)],
+        ),
+        # noun + adj plural
+        # mucche bianche
+        (
+            ["mucche", "bianche"],
+            [0, 0],
+            ["ROOT", "amod"],
+            ["NOUN", "ADJ"],
+            [(0,2)],
+        ),
+        # det + adj + noun
+        # Una grande macchina -> Una grande macchina
+        (
+            ['Una', 'grande', 'macchina'],
+            [2, 2, 2],
+            ["det", "amod", "ROOT"],
+            ["DET", "ADJ", "NOUN"],
+            [(0,3)]
+        ),
+        # det + adj + noun, det with apostrophe
+        # un'importante associazione -> un'importante associazione
+        (
+            ["Un'", 'importante', 'associazione'],
+            [2, 2, 2],
+            ["det", "amod", "ROOT"],
+            ["DET", "ADJ", "NOUN"],
+            [(0,3)]
+        ),
+        # multiple adjectives
+        # Un cane piccolo e marrone -> Un cane piccolo e marrone
+        (
+            ["Un", "cane", "piccolo", "e", "marrone"],
+            [1, 1, 1, 4, 2],
+            ["det", "ROOT", "amod", "cc", "conj"],
+            ["DET", "NOUN", "ADJ", "CCONJ", "ADJ"],
+            [(0,5)]
+        ),
+        # determiner, adjective, compound created by flat
+        # le Nazioni Unite -> le Nazioni Unite
+        (
+            ["le", "Nazioni", "Unite"],
+            [1, 1, 1],
+            ["det", "ROOT", "flat:name"],
+            ["DET", "PROPN", "PROPN"],
+            [(0,3)]
+        ),
+        # one determiner + one noun + one adjective qualified by an adverb
+        # alcuni contadini molto ricchi -> alcuni contadini molto ricchi   some very rich farmers
+        (
+            ['alcuni', 'contadini', 'molto', 'ricchi'],
+            [1, 1, 3, 1],
+            ['det', 'ROOT', 'advmod', 'amod'],
+            ['DET', 'NOUN', 'ADV', 'ADJ'],
+            [(0,4)]
+        ),
+        # Two NPs conjuncted
+        # Ho un cane e un gatto -> un cane, un gatto
+        (
+            ['Ho', 'un', 'cane', 'e', 'un', 'gatto'],
+            [0, 2, 0, 5, 5, 0],
+            ['ROOT', 'det', 'obj', 'cc', 'det', 'conj'],
+            ['VERB', 'DET', 'NOUN', 'CCONJ', 'DET', 'NOUN'],
+            [(1,3), (4,6)]
+        ),
+        # Two NPs together
+        # lo scrittore brasiliano Aníbal Machado -> lo scrittore brasiliano, Aníbal Machado
+        (
+            ['lo', 'scrittore', 'brasiliano', 'Aníbal', 'Machado'],
+            [1, 1, 1, 1, 3],
+            ['det', 'ROOT', 'amod', 'nmod', 'flat:name'],
+            ['DET', 'NOUN', 'ADJ', 'PROPN', 'PROPN'],
+            [(0, 3), (3, 5)]
+        ),
+        # Noun compound, person name and titles
+        # Dom Pedro II -> Dom Pedro II
+        (
+            ["Dom", "Pedro", "II"],
+            [0, 0, 0],
+            ["ROOT", "flat:name", "flat:name"],
+            ["PROPN", "PROPN", "PROPN"],
+            [(0,3)]
+        ),
+        # Noun compound created by flat
+        # gli Stati Uniti
+        (
+            ["gli", "Stati", "Uniti"],
+            [1, 1, 1],
+            ["det", "ROOT", "flat:name"],
+            ["DET", "PROPN", "PROPN"],
+            [(0,3)]
+        ),
+        # nmod relation between NPs
+        # la distruzione della città -> la distruzione, città
+        (
+            ['la', 'distruzione', 'della', 'città'],
+            [1, 1, 3, 1],
+            ['det', 'ROOT', 'case', 'nmod'],
+            ['DET', 'NOUN', 'ADP', 'NOUN'],
+            [(0,2), (3,4)]
+        ),
+        # Compounding by nmod, several NPs chained together
+        # la prima fabbrica di droga del governo -> la prima fabbrica, droga, governo
+        (
+            ["la", "prima", "fabbrica", "di", "droga", "del", "governo"],
+            [2, 2, 2, 4, 2, 6, 2],
+            ['det', 'amod', 'ROOT', 'case', 'nmod', 'case', 'nmod'],
+            ['DET', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN'],
+            [(0, 3), (4, 5), (6, 7)]
+        ),
+        # several NPs
+        # Traduzione del rapporto di Susana -> Traduzione, rapporto, Susana
+        (
+            ['Traduzione', 'del', 'rapporto', 'di', 'Susana'],
+            [0, 2, 0, 4, 2],
+            ['ROOT', 'case', 'nmod', 'case', 'nmod'],
+            ['NOUN', 'ADP', 'NOUN', 'ADP', 'PROPN'],
+            [(0,1), (2,3), (4,5)]
+        ),
+        # Several NPs
+        # Il gatto grasso di Susana e la sua amica -> Il gatto grasso, Susana, sua amica
+        (
+            ['Il', 'gatto', 'grasso', 'di', 'Susana', 'e', 'la', 'sua', 'amica'],
+            [1, 1, 1, 4, 1, 8, 8, 8, 1],
+            ['det', 'ROOT', 'amod', 'case', 'nmod', 'cc', 'det', 'det:poss', 'conj'],
+            ['DET', 'NOUN', 'ADJ', 'ADP', 'PROPN', 'CCONJ', 'DET', 'DET', 'NOUN'],
+            [(0,3), (4,5), (6,9)]
+        ),
+        # Passive subject
+        # La nuova spesa è alimentata dal grande conto in banca di Clinton -> Le nuova spesa, grande conto, banca, Clinton
+        (
+            ['La', 'nuova', 'spesa', 'è', 'alimentata', 'dal', 'grande', 'conto', 'in', 'banca', 'di', 'Clinton'],
+            [2, 2, 4, 4, 4, 7, 7, 4, 9, 7, 11, 9],
+            ['det', 'amod', 'nsubj:pass', 'aux:pass', 'ROOT', 'case', 'amod', 'obl:agent', 'case', 'nmod', 'case', 'nmod'],
+            ['DET', 'ADJ', 'NOUN', 'AUX', 'VERB', 'ADP', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'ADP', 'PROPN'],
+            [(0, 3), (6, 8), (9, 10), (11,12)]
+        ),
+        # Misc
+        # Ma mentre questo prestito possa ora sembrare gestibile, un improvviso cambiamento delle circostanze potrebbe portare a problemi di debiti -> questo prestiti, un provisso cambiento, circostanze, problemi, debiti
+        (
+            ['Ma', 'mentre', 'questo', 'prestito', 'possa', 'ora', 'sembrare', 'gestibile', ',', 'un', 'improvviso', 'cambiamento', 'delle', 'circostanze', 'potrebbe', 'portare', 'a', 'problemi', 'di', 'debitii'],
+            [15, 6, 3, 6, 6, 6, 15, 6, 6, 11, 11, 15, 13, 11, 15, 15, 17, 15, 19, 17],
+            ['cc', 'mark', 'det', 'nsubj', 'aux', 'advmod', 'advcl', 'xcomp', 'punct', 'det', 'amod', 'nsubj', 'case', 'nmod', 'aux', 'ROOT', 'case', 'obl', 'case', 'nmod'],
+            ['CCONJ', 'SCONJ', 'DET', 'NOUN', 'AUX', 'ADV', 'VERB', 'ADJ', 'PUNCT', 'DET', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'AUX', 'VERB', 'ADP', 'NOUN', 'ADP', 'NOUN'],
+            [(2,4), (9,12), (13,14), (17,18), (19,20)]
+        )
+    ],
+)
+# fmt: on
+def test_it_noun_chunks(it_vocab, words, heads, deps, pos, chunk_offsets):
+    doc = Doc(it_vocab, words=words, heads=heads, deps=deps, pos=pos)
+    assert [(c.start, c.end) for c in doc.noun_chunks] == chunk_offsets
+
+
+def test_noun_chunks_is_parsed_it(it_tokenizer):
+    """Test that noun_chunks raises Value Error for 'it' language if Doc is not parsed."""
+    doc = it_tokenizer("Sei andato a Oxford")
+    with pytest.raises(ValueError):
+        list(doc.noun_chunks)

View File

@@ -0,0 +1,17 @@
+import pytest
+
+
+@pytest.mark.parametrize(
+    "word", ["un", "lo", "dell", "dall", "si", "ti", "mi", "quest", "quel", "quello"]
+)
+def test_stopwords_basic(it_tokenizer, word):
+    tok = it_tokenizer(word)[0]
+    assert tok.is_stop
+
+
+@pytest.mark.parametrize(
+    "word", ["quest'uomo", "l'ho", "un'amica", "dell'olio", "s'arrende", "m'ascolti"]
+)
+def test_stopwords_elided(it_tokenizer, word):
+    tok = it_tokenizer(word)[0]
+    assert tok.is_stop
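These elision tests assume the Italian tokenizer splits apostrophe forms into two tokens, the first being one of the newly added elided stop words; a quick hedged check:

    import spacy

    nlp = spacy.blank("it")
    doc = nlp("dell'olio")
    print([t.text for t in doc], doc[0].is_stop)  # expected: ["dell'", "olio"] True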

View File

@@ -277,6 +277,21 @@ def test_issue7019():
     print_prf_per_type(msg, scores, name="foo", type="bar")
 
 
+@pytest.mark.issue(9904)
+def test_issue9904():
+    nlp = Language()
+    textcat = nlp.add_pipe("textcat")
+    get_examples = make_get_examples_single_label(nlp)
+    nlp.initialize(get_examples)
+
+    examples = get_examples()
+    scores = textcat.predict([eg.predicted for eg in examples])
+
+    loss = textcat.get_loss(examples, scores)[0]
+    loss_double_bs = textcat.get_loss(examples * 2, scores.repeat(2, axis=0))[0]
+    assert loss == pytest.approx(loss_double_bs)
+
+
 @pytest.mark.skip(reason="Test is flakey when run with others")
 def test_simple_train():
     nlp = Language()
@@ -725,6 +740,72 @@ def test_textcat_evaluation():
     assert scores["cats_micro_r"] == 4 / 6
 
 
+@pytest.mark.parametrize(
+    "multi_label,spring_p",
+    [(True, 1 / 1), (False, 1 / 2)],
+)
+def test_textcat_eval_missing(multi_label: bool, spring_p: float):
+    """
+    multi-label: the missing 'spring' in gold_doc_2 doesn't incur a penalty
+    exclusive labels: the missing 'spring' in gold_doc_2 is interpreted as 0.0"""
+    train_examples = []
+    nlp = English()
+
+    ref1 = nlp("one")
+    ref1.cats = {"winter": 0.0, "summer": 0.0, "autumn": 0.0, "spring": 1.0}
+    pred1 = nlp("one")
+    pred1.cats = {"winter": 0.0, "summer": 0.0, "autumn": 0.0, "spring": 1.0}
+    train_examples.append(Example(ref1, pred1))
+
+    ref2 = nlp("two")
+    # reference 'spring' is missing, pred 'spring' is 1
+    ref2.cats = {"winter": 0.0, "summer": 0.0, "autumn": 1.0}
+    pred2 = nlp("two")
+    pred2.cats = {"winter": 0.0, "summer": 0.0, "autumn": 0.0, "spring": 1.0}
+    train_examples.append(Example(pred2, ref2))
+
+    scores = Scorer().score_cats(
+        train_examples,
+        "cats",
+        labels=["winter", "summer", "spring", "autumn"],
+        multi_label=multi_label,
+    )
+    assert scores["cats_f_per_type"]["spring"]["p"] == spring_p
+    assert scores["cats_f_per_type"]["spring"]["r"] == 1 / 1
+
+
+@pytest.mark.parametrize(
+    "multi_label,expected_loss",
+    [(True, 0), (False, 0.125)],
+)
+def test_textcat_loss(multi_label: bool, expected_loss: float):
+    """
+    multi-label: the missing 'spring' in gold_doc_2 doesn't incur an increase in loss
+    exclusive labels: the missing 'spring' in gold_doc_2 is interpreted as 0.0 and adds to the loss"""
+    train_examples = []
+    nlp = English()
+    doc1 = nlp("one")
+    cats1 = {"winter": 0.0, "summer": 0.0, "autumn": 0.0, "spring": 1.0}
+    train_examples.append(Example.from_dict(doc1, {"cats": cats1}))
+    doc2 = nlp("two")
+    cats2 = {"winter": 0.0, "summer": 0.0, "autumn": 1.0}
+    train_examples.append(Example.from_dict(doc2, {"cats": cats2}))
+    if multi_label:
+        textcat = nlp.add_pipe("textcat_multilabel")
+    else:
+        textcat = nlp.add_pipe("textcat")
+    textcat.initialize(lambda: train_examples)
+    assert isinstance(textcat, TextCategorizer)
+    scores = textcat.model.ops.asarray(
+        [[0.0, 0.0, 0.0, 1.0], [0.0, 0.0, 1.0, 1.0]], dtype="f"  # type: ignore
+    )
+    loss, d_scores = textcat.get_loss(train_examples, scores)
+    assert loss == expected_loss
+
+
 def test_textcat_threshold():
     # Ensure the scorer can be called with a different threshold
     nlp = English()

View File

@@ -126,38 +126,26 @@ cdef class Span:
                 return False
             else:
                 return True
+
+        self_tuple = (self.c.start_char, self.c.end_char, self.c.label, self.c.kb_id, self.doc)
+        other_tuple = (other.c.start_char, other.c.end_char, other.c.label, other.c.kb_id, other.doc)
+
         # <
         if op == 0:
-            return self.c.start_char < other.c.start_char
+            return self_tuple < other_tuple
         # <=
         elif op == 1:
-            return self.c.start_char <= other.c.start_char
+            return self_tuple <= other_tuple
         # ==
         elif op == 2:
-            # Do the cheap comparisons first
-            return (
-                (self.c.start_char == other.c.start_char) and \
-                (self.c.end_char == other.c.end_char) and \
-                (self.c.label == other.c.label) and \
-                (self.c.kb_id == other.c.kb_id) and \
-                (self.doc == other.doc)
-            )
+            return self_tuple == other_tuple
         # !=
         elif op == 3:
-            # Do the cheap comparisons first
-            return not (
-                (self.c.start_char == other.c.start_char) and \
-                (self.c.end_char == other.c.end_char) and \
-                (self.c.label == other.c.label) and \
-                (self.c.kb_id == other.c.kb_id) and \
-                (self.doc == other.doc)
-            )
+            return self_tuple != other_tuple
         # >
         elif op == 4:
-            return self.c.start_char > other.c.start_char
+            return self_tuple > other_tuple
         # >=
         elif op == 5:
-            return self.c.start_char >= other.c.start_char
+            return self_tuple >= other_tuple
 
     def __hash__(self):
         return hash((self.doc, self.c.start_char, self.c.end_char, self.c.label, self.c.kb_id))
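The comparison now defers to Python's lexicographic tuple ordering, which yields a total order over (start_char, end_char, label, kb_id, doc) in a single expression per operator. An illustrative analogue with plain tuples:

    span_a = (0, 3, 0, 0)   # start_char, end_char, label, kb_id
    span_b = (0, 3, 0, 42)  # same offsets, different kb_id
    assert span_a < span_b  # ties broken by kb_id once offsets and label match
    assert sorted([span_b, span_a])[0] == span_a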

View File

@@ -188,6 +188,7 @@ def conllu_sentence_to_doc(
         id_ = int(id_) - 1
         head = (int(head) - 1) if head not in ("0", "_") else id_
         tag = pos if tag == "_" else tag
+        pos = pos if pos != "_" else ""
         morph = morph if morph != "_" else ""
         dep = "ROOT" if dep == "root" else dep
         lemmas.append(lemma)
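A minimal sketch of the effect of the added line on a CoNLL-U row with both UPOS and XPOS missing (hypothetical values):

    pos, tag, morph, dep = "_", "_", "_", "root"
    tag = pos if tag == "_" else tag
    pos = pos if pos != "_" else ""   # the new line: "_" no longer leaks into POS
    morph = morph if morph != "_" else ""
    dep = "ROOT" if dep == "root" else dep
    assert (pos, tag, dep) == ("", "_", "ROOT")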

View File

@@ -34,7 +34,11 @@ only.
 Predictions will be saved to `doc.cats` as a dictionary, where the key is the
 name of the category and the value is a score between 0 and 1 (inclusive). For
 `textcat` (exclusive categories), the scores will sum to 1, while for
-`textcat_multilabel` there is no particular guarantee about their sum.
+`textcat_multilabel` there is no particular guarantee about their sum. This also
+means that for `textcat`, missing values are equated to a value of 0 (i.e.
+`False`) and are counted as such towards the loss and scoring metrics. This is
+not the case for `textcat_multilabel`, where missing values in the gold standard
+data do not influence the loss or accuracy calculations.
 
 Note that when assigning values to create training data, the score of each
 category must be 0 or 1. Using other values, for example to create a document