diff --git a/spacy/lang/it/__init__.py b/spacy/lang/it/__init__.py index 1edebc837..ecf322bd7 100644 --- a/spacy/lang/it/__init__.py +++ b/spacy/lang/it/__init__.py @@ -6,13 +6,15 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES from ...language import Language, BaseDefaults from .lemmatizer import ItalianLemmatizer +from .syntax_iterators import SYNTAX_ITERATORS class ItalianDefaults(BaseDefaults): tokenizer_exceptions = TOKENIZER_EXCEPTIONS - stop_words = STOP_WORDS prefixes = TOKENIZER_PREFIXES infixes = TOKENIZER_INFIXES + stop_words = STOP_WORDS + syntax_iterators = SYNTAX_ITERATORS class Italian(Language): diff --git a/spacy/lang/it/syntax_iterators.py b/spacy/lang/it/syntax_iterators.py new file mode 100644 index 000000000..f63df3fad --- /dev/null +++ b/spacy/lang/it/syntax_iterators.py @@ -0,0 +1,86 @@ +from typing import Union, Iterator, Tuple + +from ...symbols import NOUN, PROPN, PRON +from ...errors import Errors +from ...tokens import Doc, Span + + +def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]: + """ + Detect base noun phrases from a dependency parse. Works on both Doc and Span. + """ + labels = [ + "nsubj", + "nsubj:pass", + "obj", + "obl", + "obl:agent", + "nmod", + "pcomp", + "appos", + "ROOT", + ] + post_modifiers = ["flat", "flat:name", "fixed", "compound"] + dets = ["det", "det:poss"] + doc = doclike.doc # Ensure works on both Doc and Span. + if not doc.has_annotation("DEP"): + raise ValueError(Errors.E029) + np_deps = {doc.vocab.strings.add(label) for label in labels} + np_modifs = {doc.vocab.strings.add(modifier) for modifier in post_modifiers} + np_label = doc.vocab.strings.add("NP") + adj_label = doc.vocab.strings.add("amod") + det_labels = {doc.vocab.strings.add(det) for det in dets} + det_pos = doc.vocab.strings.add("DET") + adp_label = doc.vocab.strings.add("ADP") + conj = doc.vocab.strings.add("conj") + conj_pos = doc.vocab.strings.add("CCONJ") + prev_end = -1 + for i, word in enumerate(doclike): + if word.pos not in (NOUN, PROPN, PRON): + continue + # Prevent nested chunks from being produced + if word.left_edge.i <= prev_end: + continue + if word.dep in np_deps: + right_childs = list(word.rights) + right_child = right_childs[0] if right_childs else None + + if right_child: + if ( + right_child.dep == adj_label + ): # allow chain of adjectives by expanding to right + right_end = right_child.right_edge + elif ( + right_child.dep in det_labels and right_child.pos == det_pos + ): # cut relative pronouns here + right_end = right_child + elif right_child.dep in np_modifs: # Check if we can expand to right + right_end = word.right_edge + else: + right_end = word + else: + right_end = word + prev_end = right_end.i + + left_index = word.left_edge.i + left_index = ( + left_index + 1 if word.left_edge.pos == adp_label else left_index + ) + + yield left_index, right_end.i + 1, np_label + elif word.dep == conj: + head = word.head + while head.dep == conj and head.head.i < head.i: + head = head.head + # If the head is an NP, and we're coordinated to it, we're an NP + if head.dep in np_deps: + prev_end = word.i + + left_index = word.left_edge.i # eliminate left attached conjunction + left_index = ( + left_index + 1 if word.left_edge.pos == conj_pos else left_index + ) + yield left_index, word.i + 1, np_label + + +SYNTAX_ITERATORS = {"noun_chunks": noun_chunks} diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 88c7adfe3..2e75f9964 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -190,6 +190,11 @@ def it_tokenizer(): return get_lang_class("it")().tokenizer +@pytest.fixture(scope="session") +def it_vocab(): + return get_lang_class("it")().vocab + + @pytest.fixture(scope="session") def ja_tokenizer(): pytest.importorskip("sudachipy") diff --git a/spacy/tests/lang/it/test_noun_chunks.py b/spacy/tests/lang/it/test_noun_chunks.py new file mode 100644 index 000000000..0a8c10e79 --- /dev/null +++ b/spacy/tests/lang/it/test_noun_chunks.py @@ -0,0 +1,221 @@ +from spacy.tokens import Doc +import pytest + + +# fmt: off +@pytest.mark.parametrize( + "words,heads,deps,pos,chunk_offsets", + [ + # determiner + noun + # un pollo -> un pollo + ( + ["un", "pollo"], + [1, 1], + ["det", "ROOT"], + ["DET", "NOUN"], + [(0,2)], + ), + # two determiners + noun + # il mio cane -> il mio cane + ( + ["il", "mio", "cane"], + [2, 2, 2], + ["det", "det:poss", "ROOT"], + ["DET", "DET", "NOUN"], + [(0,3)], + ), + # two determiners, one is after noun. rare usage but still testing + # il cane mio-> il cane mio + ( + ["il", "cane", "mio"], + [1, 1, 1], + ["det", "ROOT", "det:poss"], + ["DET", "NOUN", "DET"], + [(0,3)], + ), + # relative pronoun + # È molto bello il vestito che hai acquistat -> il vestito, che the dress that you bought is very pretty. + ( + ["È", "molto", "bello", "il", "vestito", "che", "hai", "acquistato"], + [2, 2, 2, 4, 2, 7, 7, 4], + ['cop', 'advmod', 'ROOT', 'det', 'nsubj', 'obj', 'aux', 'acl:relcl'], + ['AUX', 'ADV', 'ADJ', 'DET', 'NOUN', 'PRON', 'AUX', 'VERB'], + [(3,5), (5,6)] + ), + # relative subclause + # il computer che hai comprato -> il computer, che the computer that you bought + ( + ['il', 'computer', 'che', 'hai', 'comprato'], + [1, 1, 4, 4, 1], + ['det', 'ROOT', 'nsubj', 'aux', 'acl:relcl'], + ['DET', 'NOUN', 'PRON', 'AUX', 'VERB'], + [(0,2), (2,3)] + ), + # det + noun + adj + # Una macchina grande -> Una macchina grande + ( + ["Una", "macchina", "grande"], + [1, 1, 1], + ["det", "ROOT", "amod"], + ["DET", "NOUN", "ADJ"], + [(0,3)], + ), + # noun + adj plural + # mucche bianche + ( + ["mucche", "bianche"], + [0, 0], + ["ROOT", "amod"], + ["NOUN", "ADJ"], + [(0,2)], + ), + # det + adj + noun + # Una grande macchina -> Una grande macchina + ( + ['Una', 'grande', 'macchina'], + [2, 2, 2], + ["det", "amod", "ROOT"], + ["DET", "ADJ", "NOUN"], + [(0,3)] + ), + # det + adj + noun, det with apostrophe + # un'importante associazione -> un'importante associazione + ( + ["Un'", 'importante', 'associazione'], + [2, 2, 2], + ["det", "amod", "ROOT"], + ["DET", "ADJ", "NOUN"], + [(0,3)] + ), + # multiple adjectives + # Un cane piccolo e marrone -> Un cane piccolo e marrone + ( + ["Un", "cane", "piccolo", "e", "marrone"], + [1, 1, 1, 4, 2], + ["det", "ROOT", "amod", "cc", "conj"], + ["DET", "NOUN", "ADJ", "CCONJ", "ADJ"], + [(0,5)] + ), + # determiner, adjective, compound created by flat + # le Nazioni Unite -> le Nazioni Unite + ( + ["le", "Nazioni", "Unite"], + [1, 1, 1], + ["det", "ROOT", "flat:name"], + ["DET", "PROPN", "PROPN"], + [(0,3)] + ), + # one determiner + one noun + one adjective qualified by an adverb + # alcuni contadini molto ricchi -> alcuni contadini molto ricchi some very rich farmers + ( + ['alcuni', 'contadini', 'molto', 'ricchi'], + [1, 1, 3, 1], + ['det', 'ROOT', 'advmod', 'amod'], + ['DET', 'NOUN', 'ADV', 'ADJ'], + [(0,4)] + ), + # Two NPs conjuncted + # Ho un cane e un gatto -> un cane, un gatto + ( + ['Ho', 'un', 'cane', 'e', 'un', 'gatto'], + [0, 2, 0, 5, 5, 0], + ['ROOT', 'det', 'obj', 'cc', 'det', 'conj'], + ['VERB', 'DET', 'NOUN', 'CCONJ', 'DET', 'NOUN'], + [(1,3), (4,6)] + + ), + # Two NPs together + # lo scrittore brasiliano Aníbal Machado -> lo scrittore brasiliano, Aníbal Machado + ( + ['lo', 'scrittore', 'brasiliano', 'Aníbal', 'Machado'], + [1, 1, 1, 1, 3], + ['det', 'ROOT', 'amod', 'nmod', 'flat:name'], + ['DET', 'NOUN', 'ADJ', 'PROPN', 'PROPN'], + [(0, 3), (3, 5)] + ), + # Noun compound, person name and titles + # Dom Pedro II -> Dom Pedro II + ( + ["Dom", "Pedro", "II"], + [0, 0, 0], + ["ROOT", "flat:name", "flat:name"], + ["PROPN", "PROPN", "PROPN"], + [(0,3)] + ), + # Noun compound created by flat + # gli Stati Uniti + ( + ["gli", "Stati", "Uniti"], + [1, 1, 1], + ["det", "ROOT", "flat:name"], + ["DET", "PROPN", "PROPN"], + [(0,3)] + ), + # nmod relation between NPs + # la distruzione della città -> la distruzione, città + ( + ['la', 'distruzione', 'della', 'città'], + [1, 1, 3, 1], + ['det', 'ROOT', 'case', 'nmod'], + ['DET', 'NOUN', 'ADP', 'NOUN'], + [(0,2), (3,4)] + ), + # Compounding by nmod, several NPs chained together + # la prima fabbrica di droga del governo -> la prima fabbrica, droga, governo + ( + ["la", "prima", "fabbrica", "di", "droga", "del", "governo"], + [2, 2, 2, 4, 2, 6, 2], + ['det', 'amod', 'ROOT', 'case', 'nmod', 'case', 'nmod'], + ['DET', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN'], + [(0, 3), (4, 5), (6, 7)] + ), + # several NPs + # Traduzione del rapporto di Susana -> Traduzione, rapporto, Susana + ( + ['Traduzione', 'del', 'rapporto', 'di', 'Susana'], + [0, 2, 0, 4, 2], + ['ROOT', 'case', 'nmod', 'case', 'nmod'], + ['NOUN', 'ADP', 'NOUN', 'ADP', 'PROPN'], + [(0,1), (2,3), (4,5)] + + ), + # Several NPs + # Il gatto grasso di Susana e la sua amica -> Il gatto grasso, Susana, sua amica + ( + ['Il', 'gatto', 'grasso', 'di', 'Susana', 'e', 'la', 'sua', 'amica'], + [1, 1, 1, 4, 1, 8, 8, 8, 1], + ['det', 'ROOT', 'amod', 'case', 'nmod', 'cc', 'det', 'det:poss', 'conj'], + ['DET', 'NOUN', 'ADJ', 'ADP', 'PROPN', 'CCONJ', 'DET', 'DET', 'NOUN'], + [(0,3), (4,5), (6,9)] + ), + # Passive subject + # La nuova spesa è alimentata dal grande conto in banca di Clinton -> Le nuova spesa, grande conto, banca, Clinton + ( + ['La', 'nuova', 'spesa', 'è', 'alimentata', 'dal', 'grande', 'conto', 'in', 'banca', 'di', 'Clinton'], + [2, 2, 4, 4, 4, 7, 7, 4, 9, 7, 11, 9], + ['det', 'amod', 'nsubj:pass', 'aux:pass', 'ROOT', 'case', 'amod', 'obl:agent', 'case', 'nmod', 'case', 'nmod'], + ['DET', 'ADJ', 'NOUN', 'AUX', 'VERB', 'ADP', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'ADP', 'PROPN'], + [(0, 3), (6, 8), (9, 10), (11,12)] + ), + # Misc + # Ma mentre questo prestito possa ora sembrare gestibile, un improvviso cambiamento delle circostanze potrebbe portare a problemi di debiti -> questo prestiti, un provisso cambiento, circostanze, problemi, debiti + ( + ['Ma', 'mentre', 'questo', 'prestito', 'possa', 'ora', 'sembrare', 'gestibile', ',', 'un', 'improvviso', 'cambiamento', 'delle', 'circostanze', 'potrebbe', 'portare', 'a', 'problemi', 'di', 'debitii'], + [15, 6, 3, 6, 6, 6, 15, 6, 6, 11, 11, 15, 13, 11, 15, 15, 17, 15, 19, 17], + ['cc', 'mark', 'det', 'nsubj', 'aux', 'advmod', 'advcl', 'xcomp', 'punct', 'det', 'amod', 'nsubj', 'case', 'nmod', 'aux', 'ROOT', 'case', 'obl', 'case', 'nmod'], + ['CCONJ', 'SCONJ', 'DET', 'NOUN', 'AUX', 'ADV', 'VERB', 'ADJ', 'PUNCT', 'DET', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'AUX', 'VERB', 'ADP', 'NOUN', 'ADP', 'NOUN'], + [(2,4), (9,12), (13,14), (17,18), (19,20)] + ) + ], +) +# fmt: on +def test_it_noun_chunks(it_vocab, words, heads, deps, pos, chunk_offsets): + doc = Doc(it_vocab, words=words, heads=heads, deps=deps, pos=pos) + assert [(c.start, c.end) for c in doc.noun_chunks] == chunk_offsets + + +def test_noun_chunks_is_parsed_it(it_tokenizer): + """Test that noun_chunks raises Value Error for 'it' language if Doc is not parsed.""" + doc = it_tokenizer("Sei andato a Oxford") + with pytest.raises(ValueError): + list(doc.noun_chunks)