From a1f25412da871e14a60033c80616f139bd537b7d Mon Sep 17 00:00:00 2001 From: Richard Hudson Date: Mon, 22 Nov 2021 09:46:34 +0100 Subject: [PATCH 01/12] Edited Slovenian stop words list (#9707) --- spacy/lang/sl/stop_words.py | 130 +----------------------------------- 1 file changed, 1 insertion(+), 129 deletions(-) diff --git a/spacy/lang/sl/stop_words.py b/spacy/lang/sl/stop_words.py index 6fb01a183..c9004ed5d 100644 --- a/spacy/lang/sl/stop_words.py +++ b/spacy/lang/sl/stop_words.py @@ -1,13 +1,10 @@ # Source: https://github.com/stopwords-iso/stopwords-sl -# TODO: probably needs to be tidied up – the list seems to have month names in -# it, which shouldn't be considered stop words. +# Removed various words that are not normally considered stop words, such as months. STOP_WORDS = set( """ a ali -april -avgust b bi bil @@ -19,7 +16,6 @@ biti blizu bo bodo -bojo bolj bom bomo @@ -37,16 +33,6 @@ da daleč dan danes -datum -december -deset -deseta -deseti -deseto -devet -deveta -deveti -deveto do dober dobra @@ -54,16 +40,7 @@ dobri dobro dokler dol -dolg -dolga -dolgi dovolj -drug -druga -drugi -drugo -dva -dve e eden en @@ -74,7 +51,6 @@ enkrat eno etc. f -februar g g. ga @@ -93,16 +69,12 @@ iv ix iz j -januar jaz je ji jih jim jo -julij -junij -jutri k kadarkoli kaj @@ -123,41 +95,23 @@ kje kjer kjerkoli ko -koder koderkoli koga komu kot -kratek -kratka -kratke -kratki l -lahka -lahke -lahki -lahko le lep lepa lepe lepi lepo -leto m -maj -majhen -majhna -majhni -malce -malo manj -marec me med medtem mene -mesec mi midva midve @@ -183,7 +137,6 @@ najmanj naju največ nam -narobe nas nato nazaj @@ -192,7 +145,6 @@ naša naše ne nedavno -nedelja nek neka nekaj @@ -236,7 +188,6 @@ njuna njuno no nocoj -november npr. o ob @@ -244,51 +195,23 @@ oba obe oboje od -odprt -odprta -odprti okoli -oktober on onadva one oni onidve -osem -osma -osmi -osmo oz. p pa -pet -peta -petek -peti -peto po pod pogosto poleg -poln -polna -polni -polno ponavadi -ponedeljek ponovno potem povsod -pozdravljen -pozdravljeni -prav -prava -prave -pravi -pravo -prazen -prazna -prazno prbl. precej pred @@ -297,19 +220,10 @@ preko pri pribl. približno -primer -pripravljen -pripravljena -pripravljeni proti -prva -prvi -prvo r -ravno redko res -reč s saj sam @@ -321,29 +235,17 @@ se sebe sebi sedaj -sedem -sedma -sedmi -sedmo sem -september seveda si sicer skoraj skozi -slab smo so -sobota spet -sreda -srednja -srednji sta ste -stran -stvar sva t ta @@ -358,10 +260,6 @@ te tebe tebi tega -težak -težka -težki -težko ti tista tiste @@ -371,11 +269,6 @@ tj. 
tja to toda -torek -tretja -tretje -tretji -tri tu tudi tukaj @@ -392,10 +285,6 @@ vaša vaše ve vedno -velik -velika -veliki -veliko vendar ves več @@ -403,10 +292,6 @@ vi vidva vii viii -visok -visoka -visoke -visoki vsa vsaj vsak @@ -420,34 +305,21 @@ vsega vsi vso včasih -včeraj x z za zadaj zadnji zakaj -zaprta -zaprti -zaprto zdaj zelo zunaj č če često -četrta -četrtek -četrti -četrto čez čigav š -šest -šesta -šesti -šesto -štiri ž že """.split() From 25bd9f9d4876ed966142b01e3d37dd51c8a7c594 Mon Sep 17 00:00:00 2001 From: Duygu Altinok Date: Tue, 23 Nov 2021 16:29:25 +0100 Subject: [PATCH 02/12] Noun chunks for Italian (#9662) * added it vocab * copied portuguese * added possessive determiner * added conjed Nps * added nmoded Nps * test misc * more examples * fixed typo * fixed parenth * fixed comma * comma fix * added syntax iters * fix some index problems * fixed index * corrected heads for test case * fixed tets case * fixed determiner gender * cleaned left over * added example with apostophe --- spacy/lang/it/__init__.py | 4 +- spacy/lang/it/syntax_iterators.py | 86 +++++++++ spacy/tests/conftest.py | 5 + spacy/tests/lang/it/test_noun_chunks.py | 221 ++++++++++++++++++++++++ 4 files changed, 315 insertions(+), 1 deletion(-) create mode 100644 spacy/lang/it/syntax_iterators.py create mode 100644 spacy/tests/lang/it/test_noun_chunks.py diff --git a/spacy/lang/it/__init__.py b/spacy/lang/it/__init__.py index 1edebc837..ecf322bd7 100644 --- a/spacy/lang/it/__init__.py +++ b/spacy/lang/it/__init__.py @@ -6,13 +6,15 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES from ...language import Language, BaseDefaults from .lemmatizer import ItalianLemmatizer +from .syntax_iterators import SYNTAX_ITERATORS class ItalianDefaults(BaseDefaults): tokenizer_exceptions = TOKENIZER_EXCEPTIONS - stop_words = STOP_WORDS prefixes = TOKENIZER_PREFIXES infixes = TOKENIZER_INFIXES + stop_words = STOP_WORDS + syntax_iterators = SYNTAX_ITERATORS class Italian(Language): diff --git a/spacy/lang/it/syntax_iterators.py b/spacy/lang/it/syntax_iterators.py new file mode 100644 index 000000000..f63df3fad --- /dev/null +++ b/spacy/lang/it/syntax_iterators.py @@ -0,0 +1,86 @@ +from typing import Union, Iterator, Tuple + +from ...symbols import NOUN, PROPN, PRON +from ...errors import Errors +from ...tokens import Doc, Span + + +def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]: + """ + Detect base noun phrases from a dependency parse. Works on both Doc and Span. + """ + labels = [ + "nsubj", + "nsubj:pass", + "obj", + "obl", + "obl:agent", + "nmod", + "pcomp", + "appos", + "ROOT", + ] + post_modifiers = ["flat", "flat:name", "fixed", "compound"] + dets = ["det", "det:poss"] + doc = doclike.doc # Ensure works on both Doc and Span. 
+ if not doc.has_annotation("DEP"): + raise ValueError(Errors.E029) + np_deps = {doc.vocab.strings.add(label) for label in labels} + np_modifs = {doc.vocab.strings.add(modifier) for modifier in post_modifiers} + np_label = doc.vocab.strings.add("NP") + adj_label = doc.vocab.strings.add("amod") + det_labels = {doc.vocab.strings.add(det) for det in dets} + det_pos = doc.vocab.strings.add("DET") + adp_label = doc.vocab.strings.add("ADP") + conj = doc.vocab.strings.add("conj") + conj_pos = doc.vocab.strings.add("CCONJ") + prev_end = -1 + for i, word in enumerate(doclike): + if word.pos not in (NOUN, PROPN, PRON): + continue + # Prevent nested chunks from being produced + if word.left_edge.i <= prev_end: + continue + if word.dep in np_deps: + right_childs = list(word.rights) + right_child = right_childs[0] if right_childs else None + + if right_child: + if ( + right_child.dep == adj_label + ): # allow chain of adjectives by expanding to right + right_end = right_child.right_edge + elif ( + right_child.dep in det_labels and right_child.pos == det_pos + ): # cut relative pronouns here + right_end = right_child + elif right_child.dep in np_modifs: # Check if we can expand to right + right_end = word.right_edge + else: + right_end = word + else: + right_end = word + prev_end = right_end.i + + left_index = word.left_edge.i + left_index = ( + left_index + 1 if word.left_edge.pos == adp_label else left_index + ) + + yield left_index, right_end.i + 1, np_label + elif word.dep == conj: + head = word.head + while head.dep == conj and head.head.i < head.i: + head = head.head + # If the head is an NP, and we're coordinated to it, we're an NP + if head.dep in np_deps: + prev_end = word.i + + left_index = word.left_edge.i # eliminate left attached conjunction + left_index = ( + left_index + 1 if word.left_edge.pos == conj_pos else left_index + ) + yield left_index, word.i + 1, np_label + + +SYNTAX_ITERATORS = {"noun_chunks": noun_chunks} diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 88c7adfe3..2e75f9964 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -190,6 +190,11 @@ def it_tokenizer(): return get_lang_class("it")().tokenizer +@pytest.fixture(scope="session") +def it_vocab(): + return get_lang_class("it")().vocab + + @pytest.fixture(scope="session") def ja_tokenizer(): pytest.importorskip("sudachipy") diff --git a/spacy/tests/lang/it/test_noun_chunks.py b/spacy/tests/lang/it/test_noun_chunks.py new file mode 100644 index 000000000..0a8c10e79 --- /dev/null +++ b/spacy/tests/lang/it/test_noun_chunks.py @@ -0,0 +1,221 @@ +from spacy.tokens import Doc +import pytest + + +# fmt: off +@pytest.mark.parametrize( + "words,heads,deps,pos,chunk_offsets", + [ + # determiner + noun + # un pollo -> un pollo + ( + ["un", "pollo"], + [1, 1], + ["det", "ROOT"], + ["DET", "NOUN"], + [(0,2)], + ), + # two determiners + noun + # il mio cane -> il mio cane + ( + ["il", "mio", "cane"], + [2, 2, 2], + ["det", "det:poss", "ROOT"], + ["DET", "DET", "NOUN"], + [(0,3)], + ), + # two determiners, one is after noun. rare usage but still testing + # il cane mio-> il cane mio + ( + ["il", "cane", "mio"], + [1, 1, 1], + ["det", "ROOT", "det:poss"], + ["DET", "NOUN", "DET"], + [(0,3)], + ), + # relative pronoun + # È molto bello il vestito che hai acquistat -> il vestito, che the dress that you bought is very pretty. 
+ ( + ["È", "molto", "bello", "il", "vestito", "che", "hai", "acquistato"], + [2, 2, 2, 4, 2, 7, 7, 4], + ['cop', 'advmod', 'ROOT', 'det', 'nsubj', 'obj', 'aux', 'acl:relcl'], + ['AUX', 'ADV', 'ADJ', 'DET', 'NOUN', 'PRON', 'AUX', 'VERB'], + [(3,5), (5,6)] + ), + # relative subclause + # il computer che hai comprato -> il computer, che the computer that you bought + ( + ['il', 'computer', 'che', 'hai', 'comprato'], + [1, 1, 4, 4, 1], + ['det', 'ROOT', 'nsubj', 'aux', 'acl:relcl'], + ['DET', 'NOUN', 'PRON', 'AUX', 'VERB'], + [(0,2), (2,3)] + ), + # det + noun + adj + # Una macchina grande -> Una macchina grande + ( + ["Una", "macchina", "grande"], + [1, 1, 1], + ["det", "ROOT", "amod"], + ["DET", "NOUN", "ADJ"], + [(0,3)], + ), + # noun + adj plural + # mucche bianche + ( + ["mucche", "bianche"], + [0, 0], + ["ROOT", "amod"], + ["NOUN", "ADJ"], + [(0,2)], + ), + # det + adj + noun + # Una grande macchina -> Una grande macchina + ( + ['Una', 'grande', 'macchina'], + [2, 2, 2], + ["det", "amod", "ROOT"], + ["DET", "ADJ", "NOUN"], + [(0,3)] + ), + # det + adj + noun, det with apostrophe + # un'importante associazione -> un'importante associazione + ( + ["Un'", 'importante', 'associazione'], + [2, 2, 2], + ["det", "amod", "ROOT"], + ["DET", "ADJ", "NOUN"], + [(0,3)] + ), + # multiple adjectives + # Un cane piccolo e marrone -> Un cane piccolo e marrone + ( + ["Un", "cane", "piccolo", "e", "marrone"], + [1, 1, 1, 4, 2], + ["det", "ROOT", "amod", "cc", "conj"], + ["DET", "NOUN", "ADJ", "CCONJ", "ADJ"], + [(0,5)] + ), + # determiner, adjective, compound created by flat + # le Nazioni Unite -> le Nazioni Unite + ( + ["le", "Nazioni", "Unite"], + [1, 1, 1], + ["det", "ROOT", "flat:name"], + ["DET", "PROPN", "PROPN"], + [(0,3)] + ), + # one determiner + one noun + one adjective qualified by an adverb + # alcuni contadini molto ricchi -> alcuni contadini molto ricchi some very rich farmers + ( + ['alcuni', 'contadini', 'molto', 'ricchi'], + [1, 1, 3, 1], + ['det', 'ROOT', 'advmod', 'amod'], + ['DET', 'NOUN', 'ADV', 'ADJ'], + [(0,4)] + ), + # Two NPs conjuncted + # Ho un cane e un gatto -> un cane, un gatto + ( + ['Ho', 'un', 'cane', 'e', 'un', 'gatto'], + [0, 2, 0, 5, 5, 0], + ['ROOT', 'det', 'obj', 'cc', 'det', 'conj'], + ['VERB', 'DET', 'NOUN', 'CCONJ', 'DET', 'NOUN'], + [(1,3), (4,6)] + + ), + # Two NPs together + # lo scrittore brasiliano Aníbal Machado -> lo scrittore brasiliano, Aníbal Machado + ( + ['lo', 'scrittore', 'brasiliano', 'Aníbal', 'Machado'], + [1, 1, 1, 1, 3], + ['det', 'ROOT', 'amod', 'nmod', 'flat:name'], + ['DET', 'NOUN', 'ADJ', 'PROPN', 'PROPN'], + [(0, 3), (3, 5)] + ), + # Noun compound, person name and titles + # Dom Pedro II -> Dom Pedro II + ( + ["Dom", "Pedro", "II"], + [0, 0, 0], + ["ROOT", "flat:name", "flat:name"], + ["PROPN", "PROPN", "PROPN"], + [(0,3)] + ), + # Noun compound created by flat + # gli Stati Uniti + ( + ["gli", "Stati", "Uniti"], + [1, 1, 1], + ["det", "ROOT", "flat:name"], + ["DET", "PROPN", "PROPN"], + [(0,3)] + ), + # nmod relation between NPs + # la distruzione della città -> la distruzione, città + ( + ['la', 'distruzione', 'della', 'città'], + [1, 1, 3, 1], + ['det', 'ROOT', 'case', 'nmod'], + ['DET', 'NOUN', 'ADP', 'NOUN'], + [(0,2), (3,4)] + ), + # Compounding by nmod, several NPs chained together + # la prima fabbrica di droga del governo -> la prima fabbrica, droga, governo + ( + ["la", "prima", "fabbrica", "di", "droga", "del", "governo"], + [2, 2, 2, 4, 2, 6, 2], + ['det', 'amod', 'ROOT', 'case', 'nmod', 'case', 'nmod'], + ['DET', 'ADJ', 
'NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN'], + [(0, 3), (4, 5), (6, 7)] + ), + # several NPs + # Traduzione del rapporto di Susana -> Traduzione, rapporto, Susana + ( + ['Traduzione', 'del', 'rapporto', 'di', 'Susana'], + [0, 2, 0, 4, 2], + ['ROOT', 'case', 'nmod', 'case', 'nmod'], + ['NOUN', 'ADP', 'NOUN', 'ADP', 'PROPN'], + [(0,1), (2,3), (4,5)] + + ), + # Several NPs + # Il gatto grasso di Susana e la sua amica -> Il gatto grasso, Susana, sua amica + ( + ['Il', 'gatto', 'grasso', 'di', 'Susana', 'e', 'la', 'sua', 'amica'], + [1, 1, 1, 4, 1, 8, 8, 8, 1], + ['det', 'ROOT', 'amod', 'case', 'nmod', 'cc', 'det', 'det:poss', 'conj'], + ['DET', 'NOUN', 'ADJ', 'ADP', 'PROPN', 'CCONJ', 'DET', 'DET', 'NOUN'], + [(0,3), (4,5), (6,9)] + ), + # Passive subject + # La nuova spesa è alimentata dal grande conto in banca di Clinton -> Le nuova spesa, grande conto, banca, Clinton + ( + ['La', 'nuova', 'spesa', 'è', 'alimentata', 'dal', 'grande', 'conto', 'in', 'banca', 'di', 'Clinton'], + [2, 2, 4, 4, 4, 7, 7, 4, 9, 7, 11, 9], + ['det', 'amod', 'nsubj:pass', 'aux:pass', 'ROOT', 'case', 'amod', 'obl:agent', 'case', 'nmod', 'case', 'nmod'], + ['DET', 'ADJ', 'NOUN', 'AUX', 'VERB', 'ADP', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'ADP', 'PROPN'], + [(0, 3), (6, 8), (9, 10), (11,12)] + ), + # Misc + # Ma mentre questo prestito possa ora sembrare gestibile, un improvviso cambiamento delle circostanze potrebbe portare a problemi di debiti -> questo prestiti, un provisso cambiento, circostanze, problemi, debiti + ( + ['Ma', 'mentre', 'questo', 'prestito', 'possa', 'ora', 'sembrare', 'gestibile', ',', 'un', 'improvviso', 'cambiamento', 'delle', 'circostanze', 'potrebbe', 'portare', 'a', 'problemi', 'di', 'debitii'], + [15, 6, 3, 6, 6, 6, 15, 6, 6, 11, 11, 15, 13, 11, 15, 15, 17, 15, 19, 17], + ['cc', 'mark', 'det', 'nsubj', 'aux', 'advmod', 'advcl', 'xcomp', 'punct', 'det', 'amod', 'nsubj', 'case', 'nmod', 'aux', 'ROOT', 'case', 'obl', 'case', 'nmod'], + ['CCONJ', 'SCONJ', 'DET', 'NOUN', 'AUX', 'ADV', 'VERB', 'ADJ', 'PUNCT', 'DET', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'AUX', 'VERB', 'ADP', 'NOUN', 'ADP', 'NOUN'], + [(2,4), (9,12), (13,14), (17,18), (19,20)] + ) + ], +) +# fmt: on +def test_it_noun_chunks(it_vocab, words, heads, deps, pos, chunk_offsets): + doc = Doc(it_vocab, words=words, heads=heads, deps=deps, pos=pos) + assert [(c.start, c.end) for c in doc.noun_chunks] == chunk_offsets + + +def test_noun_chunks_is_parsed_it(it_tokenizer): + """Test that noun_chunks raises Value Error for 'it' language if Doc is not parsed.""" + doc = it_tokenizer("Sei andato a Oxford") + with pytest.raises(ValueError): + list(doc.noun_chunks) From 29f28d1f3e9d6a6ab19dc6edb7247c0d3f22df98 Mon Sep 17 00:00:00 2001 From: Duygu Altinok Date: Tue, 30 Nov 2021 12:19:07 +0100 Subject: [PATCH 03/12] French NP review (#9667) * adapted from pt * added basic tests * added fr vocab * fixed noun chunks * more examples * typo fix * changed naming * changed the naming * typo fix --- spacy/lang/fr/syntax_iterators.py | 72 ++++++-- spacy/tests/conftest.py | 5 + spacy/tests/lang/fr/test_noun_chunks.py | 224 +++++++++++++++++++++++- 3 files changed, 288 insertions(+), 13 deletions(-) diff --git a/spacy/lang/fr/syntax_iterators.py b/spacy/lang/fr/syntax_iterators.py index d86662693..5f7ba5c10 100644 --- a/spacy/lang/fr/syntax_iterators.py +++ b/spacy/lang/fr/syntax_iterators.py @@ -6,16 +6,35 @@ from ...tokens import Doc, Span def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]: - """Detect base noun phrases from a dependency parse. 
Works on Doc and Span.""" - # fmt: off - labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"] - # fmt: on + """ + Detect base noun phrases from a dependency parse. Works on both Doc and Span. + """ + labels = [ + "nsubj", + "nsubj:pass", + "obj", + "obl", + "obl:agent", + "obl:arg", + "obl:mod", + "nmod", + "pcomp", + "appos", + "ROOT", + ] + post_modifiers = ["flat", "flat:name", "flat:foreign", "fixed", "compound"] doc = doclike.doc # Ensure works on both Doc and Span. if not doc.has_annotation("DEP"): raise ValueError(Errors.E029) - np_deps = [doc.vocab.strings[label] for label in labels] - conj = doc.vocab.strings.add("conj") + np_deps = {doc.vocab.strings.add(label) for label in labels} + np_modifs = {doc.vocab.strings.add(modifier) for modifier in post_modifiers} np_label = doc.vocab.strings.add("NP") + adj_label = doc.vocab.strings.add("amod") + det_label = doc.vocab.strings.add("det") + det_pos = doc.vocab.strings.add("DET") + adp_pos = doc.vocab.strings.add("ADP") + conj_label = doc.vocab.strings.add("conj") + conj_pos = doc.vocab.strings.add("CCONJ") prev_end = -1 for i, word in enumerate(doclike): if word.pos not in (NOUN, PROPN, PRON): @@ -24,16 +43,45 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]: if word.left_edge.i <= prev_end: continue if word.dep in np_deps: - prev_end = word.right_edge.i - yield word.left_edge.i, word.right_edge.i + 1, np_label - elif word.dep == conj: + right_childs = list(word.rights) + right_child = right_childs[0] if right_childs else None + + if right_child: + if ( + right_child.dep == adj_label + ): # allow chain of adjectives by expanding to right + right_end = right_child.right_edge + elif ( + right_child.dep == det_label and right_child.pos == det_pos + ): # cut relative pronouns here + right_end = right_child + elif right_child.dep in np_modifs: # Check if we can expand to right + right_end = word.right_edge + else: + right_end = word + else: + right_end = word + prev_end = right_end.i + + left_index = word.left_edge.i + left_index = ( + left_index + 1 if word.left_edge.pos == adp_pos else left_index + ) + + yield left_index, right_end.i + 1, np_label + elif word.dep == conj_label: head = word.head - while head.dep == conj and head.head.i < head.i: + while head.dep == conj_label and head.head.i < head.i: head = head.head # If the head is an NP, and we're coordinated to it, we're an NP if head.dep in np_deps: - prev_end = word.right_edge.i - yield word.left_edge.i, word.right_edge.i + 1, np_label + prev_end = word.i + + left_index = word.left_edge.i # eliminate left attached conjunction + left_index = ( + left_index + 1 if word.left_edge.pos == conj_pos else left_index + ) + yield left_index, word.i + 1, np_label SYNTAX_ITERATORS = {"noun_chunks": noun_chunks} diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 2e75f9964..002a8f027 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -145,6 +145,11 @@ def fr_tokenizer(): return get_lang_class("fr")().tokenizer +@pytest.fixture(scope="session") +def fr_vocab(): + return get_lang_class("fr")().vocab + + @pytest.fixture(scope="session") def ga_tokenizer(): return get_lang_class("ga")().tokenizer diff --git a/spacy/tests/lang/fr/test_noun_chunks.py b/spacy/tests/lang/fr/test_noun_chunks.py index 48ac88ead..25b95f566 100644 --- a/spacy/tests/lang/fr/test_noun_chunks.py +++ b/spacy/tests/lang/fr/test_noun_chunks.py @@ -1,8 +1,230 @@ +from spacy.tokens import Doc import pytest +# fmt: off 
+@pytest.mark.parametrize( + "words,heads,deps,pos,chunk_offsets", + [ + # determiner + noun + # un nom -> un nom + ( + ["un", "nom"], + [1, 1], + ["det", "ROOT"], + ["DET", "NOUN"], + [(0, 2)], + ), + # determiner + noun starting with vowel + # l'heure -> l'heure + ( + ["l'", "heure"], + [1, 1], + ["det", "ROOT"], + ["DET", "NOUN"], + [(0, 2)], + ), + # determiner + plural noun + # les romans -> les romans + ( + ["les", "romans"], + [1, 1], + ["det", "ROOT"], + ["DET", "NOUN"], + [(0, 2)], + ), + # det + adj + noun + # Le vieux Londres -> Le vieux Londres + ( + ['Les', 'vieux', 'Londres'], + [2, 2, 2], + ["det", "amod", "ROOT"], + ["DET", "ADJ", "NOUN"], + [(0,3)] + ), + # det + noun + adj + # le nom propre -> le nom propre a proper noun + ( + ["le", "nom", "propre"], + [1, 1, 1], + ["det", "ROOT", "amod"], + ["DET", "NOUN", "ADJ"], + [(0, 3)], + ), + # det + noun + adj plural + # Les chiens bruns -> les chiens bruns + ( + ["Les", "chiens", "bruns"], + [1, 1, 1], + ["det", "ROOT", "amod"], + ["DET", "NOUN", "ADJ"], + [(0, 3)], + ), + # multiple adjectives: one adj before the noun, one adj after the noun + # un nouveau film intéressant -> un nouveau film intéressant + ( + ["un", "nouveau", "film", "intéressant"], + [2, 2, 2, 2], + ["det", "amod", "ROOT", "amod"], + ["DET", "ADJ", "NOUN", "ADJ"], + [(0,4)] + ), + # multiple adjectives, both adjs after the noun + # une personne intelligente et drôle -> une personne intelligente et drôle + ( + ["une", "personne", "intelligente", "et", "drôle"], + [1, 1, 1, 4, 2], + ["det", "ROOT", "amod", "cc", "conj"], + ["DET", "NOUN", "ADJ", "CCONJ", "ADJ"], + [(0,5)] + ), + # relative pronoun + # un bus qui va au ville -> un bus, qui, ville + ( + ['un', 'bus', 'qui', 'va', 'au', 'ville'], + [1, 1, 3, 1, 5, 3], + ['det', 'ROOT', 'nsubj', 'acl:relcl', 'case', 'obl:arg'], + ['DET', 'NOUN', 'PRON', 'VERB', 'ADP', 'NOUN'], + [(0,2), (2,3), (5,6)] + ), + # relative subclause + # Voilà la maison que nous voulons acheter -> la maison, nous That's the house that we want to buy. 
+ ( + ['Voilà', 'la', 'maison', 'que', 'nous', 'voulons', 'acheter'], + [0, 2, 0, 5, 5, 2, 5], + ['ROOT', 'det', 'obj', 'mark', 'nsubj', 'acl:relcl', 'xcomp'], + ['VERB', 'DET', 'NOUN', 'SCONJ', 'PRON', 'VERB', 'VERB'], + [(1,3), (4,5)] + ), + # Person name and title by flat + # Louis XIV -> Louis XIV + ( + ["Louis", "XIV"], + [0, 0], + ["ROOT", "flat:name"], + ["PROPN", "PROPN"], + [(0,2)] + ), + # Organization name by flat + # Nations Unies -> Nations Unies + ( + ["Nations", "Unies"], + [0, 0], + ["ROOT", "flat:name"], + ["PROPN", "PROPN"], + [(0,2)] + ), + # Noun compound, person name created by two flats + # Louise de Bratagne -> Louise de Bratagne + ( + ["Louise", "de", "Bratagne"], + [0, 0, 0], + ["ROOT", "flat:name", "flat:name"], + ["PROPN", "PROPN", "PROPN"], + [(0,3)] + ), + # Noun compound, person name created by two flats + # Louis François Joseph -> Louis François Joseph + ( + ["Louis", "François", "Joseph"], + [0, 0, 0], + ["ROOT", "flat:name", "flat:name"], + ["PROPN", "PROPN", "PROPN"], + [(0,3)] + ), + # one determiner + one noun + one adjective qualified by an adverb + # quelques agriculteurs très riches -> quelques agriculteurs très riches + ( + ["quelques", "agriculteurs", "très", "riches"], + [1, 1, 3, 1], + ['det', 'ROOT', 'advmod', 'amod'], + ['DET', 'NOUN', 'ADV', 'ADJ'], + [(0,4)] + ), + # Two NPs conjuncted + # Il a un chien et un chat -> Il, un chien, un chat + ( + ['Il', 'a', 'un', 'chien', 'et', 'un', 'chat'], + [1, 1, 3, 1, 6, 6, 3], + ['nsubj', 'ROOT', 'det', 'obj', 'cc', 'det', 'conj'], + ['PRON', 'VERB', 'DET', 'NOUN', 'CCONJ', 'DET', 'NOUN'], + [(0,1), (2,4), (5,7)] + + ), + # Two NPs together + # l'écrivain brésilien Aníbal Machado -> l'écrivain brésilien, Aníbal Machado + ( + ["l'", 'écrivain', 'brésilien', 'Aníbal', 'Machado'], + [1, 1, 1, 1, 3], + ['det', 'ROOT', 'amod', 'appos', 'flat:name'], + ['DET', 'NOUN', 'ADJ', 'PROPN', 'PROPN'], + [(0, 3), (3, 5)] + ), + # nmod relation between NPs + # la destruction de la ville -> la destruction, la ville + ( + ['la', 'destruction', 'de', 'la', 'ville'], + [1, 1, 4, 4, 1], + ['det', 'ROOT', 'case', 'det', 'nmod'], + ['DET', 'NOUN', 'ADP', 'DET', 'NOUN'], + [(0,2), (3,5)] + ), + # nmod relation between NPs + # Archiduchesse d’Autriche -> Archiduchesse, Autriche + ( + ['Archiduchesse', 'd’', 'Autriche'], + [0, 2, 0], + ['ROOT', 'case', 'nmod'], + ['NOUN', 'ADP', 'PROPN'], + [(0,1), (2,3)] + ), + # Compounding by nmod, several NPs chained together + # la première usine de drogue du gouvernement -> la première usine, drogue, gouvernement + ( + ["la", "première", "usine", "de", "drogue", "du", "gouvernement"], + [2, 2, 2, 4, 2, 6, 2], + ['det', 'amod', 'ROOT', 'case', 'nmod', 'case', 'nmod'], + ['DET', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN'], + [(0, 3), (4, 5), (6, 7)] + ), + # several NPs + # Traduction du rapport de Susana -> Traduction, rapport, Susana + ( + ['Traduction', 'du', 'raport', 'de', 'Susana'], + [0, 2, 0, 4, 2], + ['ROOT', 'case', 'nmod', 'case', 'nmod'], + ['NOUN', 'ADP', 'NOUN', 'ADP', 'PROPN'], + [(0,1), (2,3), (4,5)] + + ), + # Several NPs + # Le gros chat de Susana et son amie -> Le gros chat, Susana, son amie + ( + ['Le', 'gros', 'chat', 'de', 'Susana', 'et', 'son', 'amie'], + [2, 2, 2, 4, 2, 7, 7, 2], + ['det', 'amod', 'ROOT', 'case', 'nmod', 'cc', 'det', 'conj'], + ['DET', 'ADJ', 'NOUN', 'ADP', 'PROPN', 'CCONJ', 'DET', 'NOUN'], + [(0,3), (4,5), (6,8)] + ), + # Passive subject + # Les nouvelles dépenses sont alimentées par le grand compte bancaire de Clinton -> Les nouvelles dépenses, le 
grand compte bancaire, Clinton + ( + ['Les', 'nouvelles', 'dépenses', 'sont', 'alimentées', 'par', 'le', 'grand', 'compte', 'bancaire', 'de', 'Clinton'], + [2, 2, 4, 4, 4, 8, 8, 8, 4, 8, 11, 8], + ['det', 'amod', 'nsubj:pass', 'aux:pass', 'ROOT', 'case', 'det', 'amod', 'obl:agent', 'amod', 'case', 'nmod'], + ['DET', 'ADJ', 'NOUN', 'AUX', 'VERB', 'ADP', 'DET', 'ADJ', 'NOUN', 'ADJ', 'ADP', 'PROPN'], + [(0, 3), (6, 10), (11, 12)] + ) + ], +) +# fmt: on +def test_fr_noun_chunks(fr_vocab, words, heads, deps, pos, chunk_offsets): + doc = Doc(fr_vocab, words=words, heads=heads, deps=deps, pos=pos) + assert [(c.start, c.end) for c in doc.noun_chunks] == chunk_offsets + + def test_noun_chunks_is_parsed_fr(fr_tokenizer): """Test that noun_chunks raises Value Error for 'fr' language if Doc is not parsed.""" - doc = fr_tokenizer("trouver des travaux antérieurs") + doc = fr_tokenizer("Je suis allé à l'école") with pytest.raises(ValueError): list(doc.noun_chunks) From b4d526c357a606775e870c2dbe2a794140517d5d Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Tue, 30 Nov 2021 22:36:39 +0000 Subject: [PATCH 04/12] Add Japanese kana characters to default exceptions (fix #9693) (#9742) This includes the main kana, or phonetic characters, used in Japanese. There are some supplemental kana blocks in Unicode outside the BMP that could also be included, but because their actual use is rare I omitted them for now, but maybe they should be added. The omitted blocks are: - Kana Supplement - Kana Extended (A and B) - Small Kana Extension --- spacy/lang/char_classes.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/spacy/lang/char_classes.py b/spacy/lang/char_classes.py index 9e5441a4f..b15bb3cf3 100644 --- a/spacy/lang/char_classes.py +++ b/spacy/lang/char_classes.py @@ -45,6 +45,10 @@ _hangul_syllables = r"\uAC00-\uD7AF" _hangul_jamo = r"\u1100-\u11FF" _hangul = _hangul_syllables + _hangul_jamo +_hiragana = r"\u3040-\u309F" +_katakana = r"\u30A0-\u30FFー" +_kana = _hiragana + _katakana + # letters with diacritics - Catalan, Czech, Latin, Latvian, Lithuanian, Polish, Slovak, Turkish, Welsh _latin_u_extendedA = ( r"\u0100\u0102\u0104\u0106\u0108\u010A\u010C\u010E\u0110\u0112\u0114\u0116\u0118\u011A\u011C" @@ -244,6 +248,7 @@ _uncased = ( + _tamil + _telugu + _hangul + + _kana + _cjk ) From 251119455de70957088970ca0aa56624789ea65c Mon Sep 17 00:00:00 2001 From: Haakon Meland Eriksen Date: Tue, 7 Dec 2021 09:45:10 +0100 Subject: [PATCH 05/12] Remove NER words from stop words in Norwegian (#9820) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Default stop words in Norwegian bokmål (nb) in Spacy contain important entities, e.g. France, Germany, Russia, Sweden and USA, police district, important units of time, e.g. months and days of the week, and organisations. Nobody expects their presence among the default stop words. There is a danger of users complying with the general recommendation of filtering out stop words, while being unaware of filtering out important entities from their data. 
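To make the pitfall concrete, a minimal sketch (assumes spaCy with the
Norwegian language data installed and reflects the pre-change stop list;
the sentence is an invented example):

    import spacy

    nlp = spacy.blank("nb")
    doc = nlp("Politiet i Frankrike ble kontaktet fredag")
    # A routine preprocessing step: drop stop words before analysis.
    kept = [token.text for token in doc if not token.is_stop]
    # With the old list, "Politiet", "Frankrike" and "fredag" are all
    # flagged as stop words, so they silently vanish from `kept`.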
See explanation in https://github.com/explosion/spaCy/issues/3052#issuecomment-986756711 and comment https://github.com/explosion/spaCy/issues/3052#issuecomment-986951831 --- spacy/lang/nb/stop_words.py | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/spacy/lang/nb/stop_words.py b/spacy/lang/nb/stop_words.py index fd65dd788..d9ed414ef 100644 --- a/spacy/lang/nb/stop_words.py +++ b/spacy/lang/nb/stop_words.py @@ -4,46 +4,42 @@ alle allerede alt and andre annen annet at av bak bare bedre beste blant ble bli blir blitt bris by både -da dag de del dem den denne der dermed det dette disse drept du +da dag de del dem den denne der dermed det dette disse du eller en enn er et ett etter -fem fikk fire fjor flere folk for fortsatt fotball fra fram frankrike fredag +fem fikk fire fjor flere folk for fortsatt fra fram funnet få får fått før først første gang gi gikk gjennom gjorde gjort gjør gjøre god godt grunn gå går -ha hadde ham han hans har hele helt henne hennes her hun hva hvor hvordan -hvorfor +ha hadde ham han hans har hele helt henne hennes her hun i ifølge igjen ikke ingen inn ja jeg kamp kampen kan kl klart kom komme kommer kontakt kort kroner kunne kveld -kvinner -la laget land landet langt leder ligger like litt løpet lørdag +la laget land landet langt leder ligger like litt løpet -man mandag mange mannen mars med meg mellom men mener menn mennesker mens mer -millioner minutter mot msci mye må mål måtte +man mange med meg mellom men mener mennesker mens mer mot mye må mål måtte -ned neste noe noen nok norge norsk norske ntb ny nye nå når +ned neste noe noen nok ny nye nå når -og også om onsdag opp opplyser oslo oss over +og også om opp opplyser oss over -personer plass poeng politidistrikt politiet president prosent på +personer plass poeng på -regjeringen runde rundt russland +runde rundt -sa saken samme sammen samtidig satt se seg seks selv senere september ser sett +sa saken samme sammen samtidig satt se seg seks selv senere ser sett siden sier sin sine siste sitt skal skriver skulle slik som sted stedet stor -store står sverige svært så søndag +store står svært så -ta tatt tid tidligere til tilbake tillegg tirsdag to tok torsdag tre tror -tyskland +ta tatt tid tidligere til tilbake tillegg tok tror -under usa ut uten utenfor +under ut uten utenfor vant var ved veldig vi videre viktig vil ville viser vår være vært From 3cfeb518ee5a54742366ea5ad60ead420dcd8e3d Mon Sep 17 00:00:00 2001 From: Andrew Janco Date: Tue, 21 Dec 2021 09:46:33 -0500 Subject: [PATCH 06/12] Handle "_" value for token pos in conllu data (#9903) * change '_' to '' to allow Token.pos, when no value for token pos in conllu data * Minor code style Co-authored-by: Adriane Boyd --- spacy/training/converters/conllu_to_docs.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/training/converters/conllu_to_docs.py b/spacy/training/converters/conllu_to_docs.py index 66156b6e5..7a4f44d3b 100644 --- a/spacy/training/converters/conllu_to_docs.py +++ b/spacy/training/converters/conllu_to_docs.py @@ -188,6 +188,7 @@ def conllu_sentence_to_doc( id_ = int(id_) - 1 head = (int(head) - 1) if head not in ("0", "_") else id_ tag = pos if tag == "_" else tag + pos = pos if pos != "_" else "" morph = morph if morph != "_" else "" dep = "ROOT" if dep == "root" else dep lemmas.append(lemma) From 7ec1452f5fe2aea2aa74c4910a9a7903d979fb66 Mon Sep 17 00:00:00 2001 From: Duygu Altinok Date: Thu, 23 Dec 2021 13:41:01 +0100 Subject: [PATCH 07/12] added ellided forms (#9878) * added ellided 
forms * rearranged a bit * rearranged a bit * added stopword tests * blacked tests file --- spacy/lang/it/stop_words.py | 30 +++++++++++++-------------- spacy/tests/lang/it/test_stopwords.py | 17 +++++++++++++++ 2 files changed, 32 insertions(+), 15 deletions(-) create mode 100644 spacy/tests/lang/it/test_stopwords.py diff --git a/spacy/lang/it/stop_words.py b/spacy/lang/it/stop_words.py index 4178ed452..42adc7904 100644 --- a/spacy/lang/it/stop_words.py +++ b/spacy/lang/it/stop_words.py @@ -10,18 +10,18 @@ avresti avrete avrà avrò avuta avute avuti avuto basta bene benissimo brava bravo -casa caso cento certa certe certi certo che chi chicchessia chiunque ci +casa caso cento certa certe certi certo che chi chicchessia chiunque ci c' ciascuna ciascuno cima cio cioe circa citta città co codesta codesti codesto cogli coi col colei coll coloro colui come cominci comunque con concernente conciliarsi conclusione consiglio contro cortesia cos cosa cosi così cui -da dagl dagli dai dal dall dalla dalle dallo dappertutto davanti degl degli -dei del dell della delle dello dentro detto deve di dice dietro dire +d' da dagl dagli dai dal dall dall' dalla dalle dallo dappertutto davanti degl degli +dei del dell dell' della delle dello dentro detto deve di dice dietro dire dirimpetto diventa diventare diventato dopo dov dove dovra dovrà dovunque due dunque durante -ebbe ebbero ebbi ecc ecco ed effettivamente egli ella entrambi eppure era -erano eravamo eravate eri ero esempio esse essendo esser essere essi ex +e ebbe ebbero ebbi ecc ecco ed effettivamente egli ella entrambi eppure era +erano eravamo eravate eri ero esempio esse essendo esser essere essi ex è fa faccia facciamo facciano facciate faccio facemmo facendo facesse facessero facessi facessimo faceste facesti faceva facevamo facevano facevate facevi @@ -30,21 +30,21 @@ fareste faresti farete farà farò fatto favore fece fecero feci fin finalmente finche fine fino forse forza fosse fossero fossi fossimo foste fosti fra frattempo fu fui fummo fuori furono futuro generale -gia già giacche giorni giorno gli gliela gliele glieli glielo gliene governo +gia già giacche giorni giorno gli gl' gliela gliele glieli glielo gliene governo grande grazie gruppo ha haha hai hanno ho ieri il improvviso in inc infatti inoltre insieme intanto intorno invece io -la là lasciato lato lavoro le lei li lo lontano loro lui lungo luogo +l' la là lasciato lato lavoro le lei li lo lontano loro lui lungo luogo -ma macche magari maggior mai male malgrado malissimo mancanza marche me +m' ma macche magari maggior mai male malgrado malissimo mancanza marche me medesimo mediante meglio meno mentre mesi mezzo mi mia mie miei mila miliardi milioni minimi ministro mio modo molti moltissimo molto momento mondo mosto -nazionale ne negl negli nei nel nell nella nelle nello nemmeno neppure nessun -nessuna nessuno niente no noi non nondimeno nonostante nonsia nostra nostre +nazionale ne negl negli nei nel nell nella nelle nello nemmeno neppure nessun nessun' +nessuna nessuno nient' niente no noi non nondimeno nonostante nonsia nostra nostre nostri nostro novanta nove nulla nuovo od oggi ogni ognuna ognuno oltre oppure ora ore osi ossia ottanta otto @@ -56,12 +56,12 @@ potrebbe preferibilmente presa press prima primo principalmente probabilmente proprio puo può pure purtroppo qualche qualcosa qualcuna qualcuno quale quali qualunque quando quanta quante -quanti quanto quantunque quasi quattro quel quella quelle quelli quello quest +quanti quanto quantunque quasi quattro quel quel' quella 
quelle quelli quello quest quest' questa queste questi questo qui quindi realmente recente recentemente registrazione relativo riecco salvo -sara sarà sarai saranno sarebbe sarebbero sarei saremmo saremo sareste +s' sara sarà sarai saranno sarebbe sarebbero sarei saremmo saremo sareste saresti sarete saro sarò scola scopo scorso se secondo seguente seguito sei sembra sembrare sembrato sembri sempre senza sette si sia siamo siano siate siete sig solito solo soltanto sono sopra sotto spesso srl sta stai stando @@ -72,12 +72,12 @@ steste stesti stette stettero stetti stia stiamo stiano stiate sto su sua subito successivamente successivo sue sugl sugli sui sul sull sulla sulle sullo suo suoi -tale tali talvolta tanto te tempo ti titolo tra tranne tre trenta +t' tale tali talvolta tanto te tempo ti titolo tra tranne tre trenta troppo trovato tu tua tue tuo tuoi tutta tuttavia tutte tutti tutto -uguali ulteriore ultimo un una uno uomo +uguali ulteriore ultimo un un' una uno uomo -va vale vari varia varie vario verso vi via vicino visto vita voi volta volte +v' va vale vari varia varie vario verso vi via vicino visto vita voi volta volte vostra vostre vostri vostro """.split() ) diff --git a/spacy/tests/lang/it/test_stopwords.py b/spacy/tests/lang/it/test_stopwords.py new file mode 100644 index 000000000..954913164 --- /dev/null +++ b/spacy/tests/lang/it/test_stopwords.py @@ -0,0 +1,17 @@ +import pytest + + +@pytest.mark.parametrize( + "word", ["un", "lo", "dell", "dall", "si", "ti", "mi", "quest", "quel", "quello"] +) +def test_stopwords_basic(it_tokenizer, word): + tok = it_tokenizer(word)[0] + assert tok.is_stop + + +@pytest.mark.parametrize( + "word", ["quest'uomo", "l'ho", "un'amica", "dell'olio", "s'arrende", "m'ascolti"] +) +def test_stopwords_elided(it_tokenizer, word): + tok = it_tokenizer(word)[0] + assert tok.is_stop From 86e71e7b19a70da7139b33b88bc4ce89e9142f47 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Florian=20C=C3=A4sar?= Date: Wed, 29 Dec 2021 11:04:39 +0100 Subject: [PATCH 08/12] Fix Scorer.score_cats for missing labels (#9443) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Fix Scorer.score_cats for missing labels * Add test case for Scorer.score_cats missing labels * semantic nitpick * black formatting * adjust test to give different results depending on multi_label setting * fix loss function according to whether or not missing values are supported * add note to docs * small fixes * make mypy happy * Update spacy/pipeline/textcat.py Co-authored-by: Florian Cäsar Co-authored-by: Sofie Van Landeghem Co-authored-by: svlandeg --- spacy/pipeline/senter.pyx | 2 +- spacy/pipeline/spancat.py | 3 +- spacy/pipeline/textcat.py | 11 ++++- spacy/pipeline/textcat_multilabel.py | 10 +++-- spacy/scorer.py | 24 +++++----- spacy/tests/pipeline/test_textcat.py | 66 ++++++++++++++++++++++++++++ website/docs/api/textcategorizer.md | 6 ++- 7 files changed, 103 insertions(+), 19 deletions(-) diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx index 54ce021af..2e0f364f0 100644 --- a/spacy/pipeline/senter.pyx +++ b/spacy/pipeline/senter.pyx @@ -1,6 +1,6 @@ # cython: infer_types=True, profile=True, binding=True -from itertools import islice from typing import Optional, Callable +from itertools import islice import srsly from thinc.api import Model, SequenceCategoricalCrossentropy, Config diff --git a/spacy/pipeline/spancat.py b/spacy/pipeline/spancat.py index 829def1eb..01c9c407f 100644 --- a/spacy/pipeline/spancat.py +++ 
b/spacy/pipeline/spancat.py @@ -1,9 +1,10 @@ -import numpy from typing import List, Dict, Callable, Tuple, Optional, Iterable, Any, cast from thinc.api import Config, Model, get_current_ops, set_dropout_rate, Ops from thinc.api import Optimizer from thinc.types import Ragged, Ints2d, Floats2d, Ints1d +import numpy + from ..compat import Protocol, runtime_checkable from ..scorer import Scorer from ..language import Language diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py index 30a65ec52..e20ae87f1 100644 --- a/spacy/pipeline/textcat.py +++ b/spacy/pipeline/textcat.py @@ -1,8 +1,8 @@ -from itertools import islice from typing import Iterable, Tuple, Optional, Dict, List, Callable, Any from thinc.api import get_array_module, Model, Optimizer, set_dropout_rate, Config from thinc.types import Floats2d import numpy +from itertools import islice from .trainable_pipe import TrainablePipe from ..language import Language @@ -158,6 +158,13 @@ class TextCategorizer(TrainablePipe): self.cfg = dict(cfg) self.scorer = scorer + @property + def support_missing_values(self): + # There are no missing values as the textcat should always + # predict exactly one label. All other labels are 0.0 + # Subclasses may override this property to change internal behaviour. + return False + @property def labels(self) -> Tuple[str]: """RETURNS (Tuple[str]): The labels currently added to the component. @@ -294,7 +301,7 @@ class TextCategorizer(TrainablePipe): for j, label in enumerate(self.labels): if label in eg.reference.cats: truths[i, j] = eg.reference.cats[label] - else: + elif self.support_missing_values: not_missing[i, j] = 0.0 truths = self.model.ops.asarray(truths) # type: ignore return truths, not_missing # type: ignore diff --git a/spacy/pipeline/textcat_multilabel.py b/spacy/pipeline/textcat_multilabel.py index a7bfacca7..e33a885f8 100644 --- a/spacy/pipeline/textcat_multilabel.py +++ b/spacy/pipeline/textcat_multilabel.py @@ -1,8 +1,8 @@ -from itertools import islice from typing import Iterable, Optional, Dict, List, Callable, Any - -from thinc.api import Model, Config from thinc.types import Floats2d +from thinc.api import Model, Config + +from itertools import islice from ..language import Language from ..training import Example, validate_get_examples @@ -158,6 +158,10 @@ class MultiLabel_TextCategorizer(TextCategorizer): self.cfg = dict(cfg) self.scorer = scorer + @property + def support_missing_values(self): + return True + def initialize( # type: ignore[override] self, get_examples: Callable[[], Iterable[Example]], diff --git a/spacy/scorer.py b/spacy/scorer.py index 4d596b5e1..ae9338bd5 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -445,7 +445,8 @@ class Scorer: getter(doc, attr) should return the values for the individual doc. labels (Iterable[str]): The set of possible labels. Defaults to []. multi_label (bool): Whether the attribute allows multiple labels. - Defaults to True. + Defaults to True. When set to False (exclusive labels), missing + gold labels are interpreted as 0.0. positive_label (str): The positive label for a binary task with exclusive classes. Defaults to None. threshold (float): Cutoff to consider a prediction "positive". 
Defaults @@ -484,13 +485,15 @@ class Scorer: for label in labels: pred_score = pred_cats.get(label, 0.0) - gold_score = gold_cats.get(label, 0.0) + gold_score = gold_cats.get(label) + if not gold_score and not multi_label: + gold_score = 0.0 if gold_score is not None: auc_per_type[label].score_set(pred_score, gold_score) if multi_label: for label in labels: pred_score = pred_cats.get(label, 0.0) - gold_score = gold_cats.get(label, 0.0) + gold_score = gold_cats.get(label) if gold_score is not None: if pred_score >= threshold and gold_score > 0: f_per_type[label].tp += 1 @@ -502,16 +505,15 @@ class Scorer: # Get the highest-scoring for each. pred_label, pred_score = max(pred_cats.items(), key=lambda it: it[1]) gold_label, gold_score = max(gold_cats.items(), key=lambda it: it[1]) - if gold_score is not None: - if pred_label == gold_label and pred_score >= threshold: - f_per_type[pred_label].tp += 1 - else: - f_per_type[gold_label].fn += 1 - if pred_score >= threshold: - f_per_type[pred_label].fp += 1 + if pred_label == gold_label and pred_score >= threshold: + f_per_type[pred_label].tp += 1 + else: + f_per_type[gold_label].fn += 1 + if pred_score >= threshold: + f_per_type[pred_label].fp += 1 elif gold_cats: gold_label, gold_score = max(gold_cats, key=lambda it: it[1]) - if gold_score is not None and gold_score > 0: + if gold_score > 0: f_per_type[gold_label].fn += 1 elif pred_cats: pred_label, pred_score = max(pred_cats.items(), key=lambda it: it[1]) diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py index 282789f2b..52bf6ec5c 100644 --- a/spacy/tests/pipeline/test_textcat.py +++ b/spacy/tests/pipeline/test_textcat.py @@ -725,6 +725,72 @@ def test_textcat_evaluation(): assert scores["cats_micro_r"] == 4 / 6 +@pytest.mark.parametrize( + "multi_label,spring_p", + [(True, 1 / 1), (False, 1 / 2)], +) +def test_textcat_eval_missing(multi_label: bool, spring_p: float): + """ + multi-label: the missing 'spring' in gold_doc_2 doesn't incur a penalty + exclusive labels: the missing 'spring' in gold_doc_2 is interpreted as 0.0""" + train_examples = [] + nlp = English() + + ref1 = nlp("one") + ref1.cats = {"winter": 0.0, "summer": 0.0, "autumn": 0.0, "spring": 1.0} + pred1 = nlp("one") + pred1.cats = {"winter": 0.0, "summer": 0.0, "autumn": 0.0, "spring": 1.0} + train_examples.append(Example(ref1, pred1)) + + ref2 = nlp("two") + # reference 'spring' is missing, pred 'spring' is 1 + ref2.cats = {"winter": 0.0, "summer": 0.0, "autumn": 1.0} + pred2 = nlp("two") + pred2.cats = {"winter": 0.0, "summer": 0.0, "autumn": 0.0, "spring": 1.0} + train_examples.append(Example(pred2, ref2)) + + scores = Scorer().score_cats( + train_examples, + "cats", + labels=["winter", "summer", "spring", "autumn"], + multi_label=multi_label, + ) + assert scores["cats_f_per_type"]["spring"]["p"] == spring_p + assert scores["cats_f_per_type"]["spring"]["r"] == 1 / 1 + + +@pytest.mark.parametrize( + "multi_label,expected_loss", + [(True, 0), (False, 0.125)], +) +def test_textcat_loss(multi_label: bool, expected_loss: float): + """ + multi-label: the missing 'spring' in gold_doc_2 doesn't incur an increase in loss + exclusive labels: the missing 'spring' in gold_doc_2 is interpreted as 0.0 and adds to the loss""" + train_examples = [] + nlp = English() + + doc1 = nlp("one") + cats1 = {"winter": 0.0, "summer": 0.0, "autumn": 0.0, "spring": 1.0} + train_examples.append(Example.from_dict(doc1, {"cats": cats1})) + + doc2 = nlp("two") + cats2 = {"winter": 0.0, "summer": 0.0, "autumn": 1.0} + 
train_examples.append(Example.from_dict(doc2, {"cats": cats2})) + + if multi_label: + textcat = nlp.add_pipe("textcat_multilabel") + else: + textcat = nlp.add_pipe("textcat") + textcat.initialize(lambda: train_examples) + assert isinstance(textcat, TextCategorizer) + scores = textcat.model.ops.asarray( + [[0.0, 0.0, 0.0, 1.0], [0.0, 0.0, 1.0, 1.0]], dtype="f" # type: ignore + ) + loss, d_scores = textcat.get_loss(train_examples, scores) + assert loss == expected_loss + + def test_textcat_threshold(): # Ensure the scorer can be called with a different threshold nlp = English() diff --git a/website/docs/api/textcategorizer.md b/website/docs/api/textcategorizer.md index 47f868637..2ff569bad 100644 --- a/website/docs/api/textcategorizer.md +++ b/website/docs/api/textcategorizer.md @@ -34,7 +34,11 @@ only. Predictions will be saved to `doc.cats` as a dictionary, where the key is the name of the category and the value is a score between 0 and 1 (inclusive). For `textcat` (exclusive categories), the scores will sum to 1, while for -`textcat_multilabel` there is no particular guarantee about their sum. +`textcat_multilabel` there is no particular guarantee about their sum. This also +means that for `textcat`, missing values are equated to a value of 0 (i.e. +`False`) and are counted as such towards the loss and scoring metrics. This is +not the case for `textcat_multilabel`, where missing values in the gold standard +data do not influence the loss or accuracy calculations. Note that when assigning values to create training data, the score of each category must be 0 or 1. Using other values, for example to create a document From 176a90edeec38ced8c5b1e2f7fd1d28bf1e9e1c1 Mon Sep 17 00:00:00 2001 From: jsnfly <37632631+jsnfly@users.noreply.github.com> Date: Thu, 13 Jan 2022 09:03:23 +0100 Subject: [PATCH 09/12] Fix texcat loss scaling (#9904) (#10002) * add failing test for issue 9904 * remove division by batch size and summation before applying the mean Co-authored-by: jonas --- spacy/pipeline/textcat.py | 4 ++-- spacy/tests/pipeline/test_textcat.py | 15 +++++++++++++++ 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py index e20ae87f1..dd5fdc078 100644 --- a/spacy/pipeline/textcat.py +++ b/spacy/pipeline/textcat.py @@ -320,9 +320,9 @@ class TextCategorizer(TrainablePipe): self._validate_categories(examples) truths, not_missing = self._examples_to_truth(examples) not_missing = self.model.ops.asarray(not_missing) # type: ignore - d_scores = (scores - truths) / scores.shape[0] + d_scores = (scores - truths) d_scores *= not_missing - mean_square_error = (d_scores ** 2).sum(axis=1).mean() + mean_square_error = (d_scores ** 2).mean() return float(mean_square_error), d_scores def add_label(self, label: str) -> int: diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py index 52bf6ec5c..798dd165e 100644 --- a/spacy/tests/pipeline/test_textcat.py +++ b/spacy/tests/pipeline/test_textcat.py @@ -277,6 +277,21 @@ def test_issue7019(): print_prf_per_type(msg, scores, name="foo", type="bar") +@pytest.mark.issue(9904) +def test_issue9904(): + nlp = Language() + textcat = nlp.add_pipe("textcat") + get_examples = make_get_examples_single_label(nlp) + nlp.initialize(get_examples) + + examples = get_examples() + scores = textcat.predict([eg.predicted for eg in examples]) + + loss = textcat.get_loss(examples, scores)[0] + loss_double_bs = textcat.get_loss(examples * 2, scores.repeat(2, axis=0))[0] + assert loss == 
pytest.approx(loss_double_bs) + + @pytest.mark.skip(reason="Test is flakey when run with others") def test_simple_train(): nlp = Language() From 677c1a35072ff2deb3af6638802f506d623ed8f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Thu, 13 Jan 2022 09:03:55 +0100 Subject: [PATCH 10/12] Speed up the StateC::L feature function (#10019) * Speed up the StateC::L feature function This function gets the n-th most-recent left-arc with a particular head. Before this change, StateC::L would construct a vector of all left-arcs with the given head and then pick the n-th most recent from that vector. Since the number of left-arcs strongly correlates with the doc length and the feature is constructed for every transition, this can make transition-parsing quadratic. With this change StateC::L: - Searches left-arcs backwards. - Stops early when the n-th matching transition is found. - Does not construct a vector (reducing memory pressure). This change doesn't avoid the linear search when the transition that is queried does not occur in the left-arcs. Regardless, performance is improved quite a bit with very long docs: Before: N Time 400 3.3 800 5.4 1600 11.6 3200 30.7 After: N Time 400 3.2 800 5.0 1600 9.5 3200 23.2 We can probably do better with more tailored data structures, but I first wanted to make a low-impact PR. Found while investigating #9858. * StateC::L: simplify loop --- spacy/pipeline/_parser_internals/_state.pxd | 23 +++++++++++++-------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/spacy/pipeline/_parser_internals/_state.pxd b/spacy/pipeline/_parser_internals/_state.pxd index 161f3ca48..27623e7c6 100644 --- a/spacy/pipeline/_parser_internals/_state.pxd +++ b/spacy/pipeline/_parser_internals/_state.pxd @@ -1,3 +1,4 @@ +from cython.operator cimport dereference as deref, preincrement as incr from libc.string cimport memcpy, memset from libc.stdlib cimport calloc, free from libc.stdint cimport uint32_t, uint64_t @@ -184,16 +185,20 @@ cdef cppclass StateC: int L(int head, int idx) nogil const: if idx < 1 or this._left_arcs.size() == 0: return -1 - cdef vector[int] lefts - for i in range(this._left_arcs.size()): - arc = this._left_arcs.at(i) + + # Work backwards through left-arcs to find the arc at the + # requested index more quickly. + cdef size_t child_index = 0 + it = this._left_arcs.const_rbegin() + while it != this._left_arcs.rend(): + arc = deref(it) if arc.head == head and arc.child != -1 and arc.child < head: - lefts.push_back(arc.child) - idx = (lefts.size()) - idx - if idx < 0: - return -1 - else: - return lefts.at(idx) + child_index += 1 + if child_index == idx: + return arc.child + incr(it) + + return -1 int R(int head, int idx) nogil const: if idx < 1 or this._right_arcs.size() == 0: From 63fa55089dff3b5a5208c24914cd0faa5909108a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= Date: Thu, 13 Jan 2022 10:33:30 +0100 Subject: [PATCH 11/12] Use constant-time head lookups in StateC::{L,R} This change changes the type of left/right-arc collections from vector[ArcC] to unordered_map[int, vector[Arc]], so that the arcs are keyed by the head. This allows us to find all the left/right arcs for a particular head in constant time in StateC::{L,R}. Benchmarks with long docs (N is the number of text repetitions): Before (using #10019): N Time (s) 400 3.2 800 5.0 1600 9.5 3200 23.2 6400 66.8 12800 220.0 After (this commit): N Time (s) 400 3.1 800 4.3 1600 6.7 3200 12.0 6400 22.0 12800 42.0 Related to #9858 and #10019. 
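For intuition, the new bookkeeping amounts to the following plain-Python
sketch (illustrative only, not the Cython implementation; the names
head_to_arcs and nth_child are invented for the example):

    from collections import defaultdict

    # Arcs are bucketed per head instead of kept in one flat vector.
    head_to_arcs = defaultdict(list)

    def add_arc(head, child, label):
        head_to_arcs[head].append((child, label))

    def nth_child(head, idx):
        # O(1) bucket lookup, then a backwards scan over this head's
        # arcs only (most recent first) rather than over every arc.
        matches = 0
        for child, _label in reversed(head_to_arcs.get(head, [])):
            if child != -1:
                matches += 1
                if matches == idx:
                    return child
        return -1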
--- spacy/pipeline/_parser_internals/_state.pxd | 120 ++++++++++++-------- 1 file changed, 70 insertions(+), 50 deletions(-) diff --git a/spacy/pipeline/_parser_internals/_state.pxd b/spacy/pipeline/_parser_internals/_state.pxd index 27623e7c6..a1262bb61 100644 --- a/spacy/pipeline/_parser_internals/_state.pxd +++ b/spacy/pipeline/_parser_internals/_state.pxd @@ -3,6 +3,7 @@ from libc.string cimport memcpy, memset from libc.stdlib cimport calloc, free from libc.stdint cimport uint32_t, uint64_t cimport libcpp +from libcpp.unordered_map cimport unordered_map from libcpp.vector cimport vector from libcpp.set cimport set from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno @@ -30,8 +31,8 @@ cdef cppclass StateC: vector[int] _stack vector[int] _rebuffer vector[SpanC] _ents - vector[ArcC] _left_arcs - vector[ArcC] _right_arcs + unordered_map[int, vector[ArcC]] _left_arcs + unordered_map[int, vector[ArcC]] _right_arcs vector[libcpp.bool] _unshiftable set[int] _sent_starts TokenC _empty_token @@ -160,15 +161,22 @@ cdef cppclass StateC: else: return &this._sent[i] - void get_arcs(vector[ArcC]* arcs) nogil const: - for i in range(this._left_arcs.size()): - arc = this._left_arcs.at(i) - if arc.head != -1 and arc.child != -1: - arcs.push_back(arc) - for i in range(this._right_arcs.size()): - arc = this._right_arcs.at(i) - if arc.head != -1 and arc.child != -1: - arcs.push_back(arc) + void map_get_arcs(const unordered_map[int, vector[ArcC]] &heads_arcs, vector[ArcC]* out) nogil const: + cdef const vector[ArcC]* arcs + head_arcs_it = heads_arcs.const_begin() + while head_arcs_it != heads_arcs.const_end(): + arcs = &deref(head_arcs_it).second + arcs_it = arcs.const_begin() + while arcs_it != arcs.const_end(): + arc = deref(arcs_it) + if arc.head != -1 and arc.child != -1: + out.push_back(arc) + incr(arcs_it) + incr(head_arcs_it) + + void get_arcs(vector[ArcC]* out) nogil const: + this.map_get_arcs(this._left_arcs, out) + this.map_get_arcs(this._right_arcs, out) int H(int child) nogil const: if child >= this.length or child < 0: @@ -182,37 +190,35 @@ cdef cppclass StateC: else: return this._ents.back().start - int L(int head, int idx) nogil const: - if idx < 1 or this._left_arcs.size() == 0: + int nth_child(const unordered_map[int, vector[ArcC]]& heads_arcs, int head, int idx) nogil const: + if idx < 1: return -1 - # Work backwards through left-arcs to find the arc at the + head_arcs_it = heads_arcs.const_find(head) + if head_arcs_it == heads_arcs.const_end(): + return -1 + + cdef const vector[ArcC]* arcs = &deref(head_arcs_it).second + + # Work backwards through arcs to find the arc at the # requested index more quickly. 
cdef size_t child_index = 0 - it = this._left_arcs.const_rbegin() - while it != this._left_arcs.rend(): - arc = deref(it) - if arc.head == head and arc.child != -1 and arc.child < head: + arcs_it = arcs.const_rbegin() + while arcs_it != arcs.const_rend() and child_index != idx: + arc = deref(arcs_it) + if arc.child != -1: child_index += 1 if child_index == idx: return arc.child - incr(it) + incr(arcs_it) return -1 + int L(int head, int idx) nogil const: + return this.nth_child(this._left_arcs, head, idx) + int R(int head, int idx) nogil const: - if idx < 1 or this._right_arcs.size() == 0: - return -1 - cdef vector[int] rights - for i in range(this._right_arcs.size()): - arc = this._right_arcs.at(i) - if arc.head == head and arc.child != -1 and arc.child > head: - rights.push_back(arc.child) - idx = (rights.size()) - idx - if idx < 0: - return -1 - else: - return rights.at(idx) + return this.nth_child(this._right_arcs, head, idx) bint empty() nogil const: return this._stack.size() == 0 @@ -253,22 +259,29 @@ cdef cppclass StateC: int r_edge(int word) nogil const: return word - - int n_L(int head) nogil const: + + int n_arcs(const unordered_map[int, vector[ArcC]] &heads_arcs, int head) nogil const: cdef int n = 0 - for i in range(this._left_arcs.size()): - arc = this._left_arcs.at(i) - if arc.head == head and arc.child != -1 and arc.child < arc.head: + head_arcs_it = heads_arcs.const_find(head) + if head_arcs_it == heads_arcs.const_end(): + return n + + cdef const vector[ArcC]* arcs = &deref(head_arcs_it).second + arcs_it = arcs.const_begin() + while arcs_it != arcs.end(): + arc = deref(arcs_it) + if arc.child != -1: n += 1 + incr(arcs_it) + return n + + int n_L(int head) nogil const: + return n_arcs(this._left_arcs, head) + int n_R(int head) nogil const: - cdef int n = 0 - for i in range(this._right_arcs.size()): - arc = this._right_arcs.at(i) - if arc.head == head and arc.child != -1 and arc.child > arc.head: - n += 1 - return n + return n_arcs(this._right_arcs, head) bint stack_is_connected() nogil const: return False @@ -328,19 +341,20 @@ cdef cppclass StateC: arc.child = child arc.label = label if head > child: - this._left_arcs.push_back(arc) + this._left_arcs[arc.head].push_back(arc) else: - this._right_arcs.push_back(arc) + this._right_arcs[arc.head].push_back(arc) this._heads[child] = head - void del_arc(int h_i, int c_i) nogil: - cdef vector[ArcC]* arcs - if h_i > c_i: - arcs = &this._left_arcs - else: - arcs = &this._right_arcs + void map_del_arc(unordered_map[int, vector[ArcC]]* heads_arcs, int h_i, int c_i) nogil: + arcs_it = heads_arcs.find(h_i) + if arcs_it == heads_arcs.end(): + return + + arcs = &deref(arcs_it).second if arcs.size() == 0: return + arc = arcs.back() if arc.head == h_i and arc.child == c_i: arcs.pop_back() @@ -353,6 +367,12 @@ cdef cppclass StateC: arc.label = 0 break + void del_arc(int h_i, int c_i) nogil: + if h_i > c_i: + this.map_del_arc(&this._left_arcs, h_i, c_i) + else: + this.map_del_arc(&this._right_arcs, h_i, c_i) + SpanC get_ent() nogil const: cdef SpanC ent if this._ents.size() == 0: From 47ea6704f1045ee3a04ac7ffbfedba01d944e233 Mon Sep 17 00:00:00 2001 From: Natalia Rodnova <4512370+nrodnova@users.noreply.github.com> Date: Mon, 17 Jan 2022 03:17:49 -0700 Subject: [PATCH 12/12] Span richcmp fix (#9956) * Corrected Span's __richcmp__ implementation to take end, label and kb_id in consideration * Updated test * Updated test * Removed formatting from a test for readability sake * Use same tuples for all comparisons Co-authored-by: Adriane Boyd --- 
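Illustration of the new comparison semantics (a minimal runnable sketch
mirroring the tests added below; assumes a blank English pipeline):

    import spacy
    from spacy.tokens import Span

    nlp = spacy.blank("en")
    doc = nlp("one two three four")
    # Comparison now uses the tuple (start_char, end_char, label, kb_id, doc),
    # so spans with identical offsets but different labels are not equal...
    assert Span(doc, 0, 3) != Span(doc, 0, 3, "LABEL")
    # ...and ordering takes end, label and kb_id into account, not just start.
    assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") < Span(doc, 0, 4)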
spacy/tests/doc/test_span.py | 49 ++++++++++++++++++++++++++++++++++++ spacy/tokens/span.pyx | 28 ++++++--------------- 2 files changed, 57 insertions(+), 20 deletions(-) diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py index 10aba5b94..bdf34c1c1 100644 --- a/spacy/tests/doc/test_span.py +++ b/spacy/tests/doc/test_span.py @@ -573,6 +573,55 @@ def test_span_with_vectors(doc): doc.vocab.vectors = prev_vectors +# fmt: off +def test_span_comparison(doc): + + # Identical start, end, only differ in label and kb_id + assert Span(doc, 0, 3) == Span(doc, 0, 3) + assert Span(doc, 0, 3, "LABEL") == Span(doc, 0, 3, "LABEL") + assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") == Span(doc, 0, 3, "LABEL", kb_id="KB_ID") + + assert Span(doc, 0, 3) != Span(doc, 0, 3, "LABEL") + assert Span(doc, 0, 3) != Span(doc, 0, 3, "LABEL", kb_id="KB_ID") + assert Span(doc, 0, 3, "LABEL") != Span(doc, 0, 3, "LABEL", kb_id="KB_ID") + + assert Span(doc, 0, 3) <= Span(doc, 0, 3) and Span(doc, 0, 3) >= Span(doc, 0, 3) + assert Span(doc, 0, 3, "LABEL") <= Span(doc, 0, 3, "LABEL") and Span(doc, 0, 3, "LABEL") >= Span(doc, 0, 3, "LABEL") + assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") <= Span(doc, 0, 3, "LABEL", kb_id="KB_ID") + assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") >= Span(doc, 0, 3, "LABEL", kb_id="KB_ID") + + assert (Span(doc, 0, 3) < Span(doc, 0, 3, "", kb_id="KB_ID") < Span(doc, 0, 3, "LABEL") < Span(doc, 0, 3, "LABEL", kb_id="KB_ID")) + assert (Span(doc, 0, 3) <= Span(doc, 0, 3, "", kb_id="KB_ID") <= Span(doc, 0, 3, "LABEL") <= Span(doc, 0, 3, "LABEL", kb_id="KB_ID")) + + assert (Span(doc, 0, 3, "LABEL", kb_id="KB_ID") > Span(doc, 0, 3, "LABEL") > Span(doc, 0, 3, "", kb_id="KB_ID") > Span(doc, 0, 3)) + assert (Span(doc, 0, 3, "LABEL", kb_id="KB_ID") >= Span(doc, 0, 3, "LABEL") >= Span(doc, 0, 3, "", kb_id="KB_ID") >= Span(doc, 0, 3)) + + # Different end + assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") < Span(doc, 0, 4, "LABEL", kb_id="KB_ID") + + assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") < Span(doc, 0, 4) + assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") <= Span(doc, 0, 4) + assert Span(doc, 0, 4) > Span(doc, 0, 3, "LABEL", kb_id="KB_ID") + assert Span(doc, 0, 4) >= Span(doc, 0, 3, "LABEL", kb_id="KB_ID") + + # Different start + assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") != Span(doc, 1, 3, "LABEL", kb_id="KB_ID") + + assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") < Span(doc, 1, 3) + assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") <= Span(doc, 1, 3) + assert Span(doc, 1, 3) > Span(doc, 0, 3, "LABEL", kb_id="KB_ID") + assert Span(doc, 1, 3) >= Span(doc, 0, 3, "LABEL", kb_id="KB_ID") + + # Different start & different end + assert Span(doc, 0, 4, "LABEL", kb_id="KB_ID") != Span(doc, 1, 3, "LABEL", kb_id="KB_ID") + + assert Span(doc, 0, 4, "LABEL", kb_id="KB_ID") < Span(doc, 1, 3) + assert Span(doc, 0, 4, "LABEL", kb_id="KB_ID") <= Span(doc, 1, 3) + assert Span(doc, 1, 3) > Span(doc, 0, 4, "LABEL", kb_id="KB_ID") + assert Span(doc, 1, 3) >= Span(doc, 0, 4, "LABEL", kb_id="KB_ID") +# fmt: on + + @pytest.mark.parametrize( "start,end,expected_sentences,expected_sentences_with_hook", [ diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index cd02cab36..5484b25fd 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -126,38 +126,26 @@ cdef class Span: return False else: return True + self_tuple = (self.c.start_char, self.c.end_char, self.c.label, self.c.kb_id, self.doc) + other_tuple = (other.c.start_char, other.c.end_char, other.c.label, other.c.kb_id, other.doc) # < if 
op == 0: - return self.c.start_char < other.c.start_char + return self_tuple < other_tuple # <= elif op == 1: - return self.c.start_char <= other.c.start_char + return self_tuple <= other_tuple # == elif op == 2: - # Do the cheap comparisons first - return ( - (self.c.start_char == other.c.start_char) and \ - (self.c.end_char == other.c.end_char) and \ - (self.c.label == other.c.label) and \ - (self.c.kb_id == other.c.kb_id) and \ - (self.doc == other.doc) - ) + return self_tuple == other_tuple # != elif op == 3: - # Do the cheap comparisons first - return not ( - (self.c.start_char == other.c.start_char) and \ - (self.c.end_char == other.c.end_char) and \ - (self.c.label == other.c.label) and \ - (self.c.kb_id == other.c.kb_id) and \ - (self.doc == other.doc) - ) + return self_tuple != other_tuple # > elif op == 4: - return self.c.start_char > other.c.start_char + return self_tuple > other_tuple # >= elif op == 5: - return self.c.start_char >= other.c.start_char + return self_tuple >= other_tuple def __hash__(self): return hash((self.doc, self.c.start_char, self.c.end_char, self.c.label, self.c.kb_id))
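The Italian and French noun-chunk iterators from patches 02 and 03 are used
like any other syntax iterator once a parsed pipeline is loaded (a minimal
usage sketch; assumes the it_core_news_sm model is installed and that it
parses the sentence as in the test fixtures above):

    import spacy

    nlp = spacy.load("it_core_news_sm")
    doc = nlp("Ho un cane e un gatto")
    # Conjoined NPs now come out as separate chunks, as in the tests:
    # expected ['un cane', 'un gatto']
    print([chunk.text for chunk in doc.noun_chunks])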