diff --git a/spacy/lang/fr/syntax_iterators.py b/spacy/lang/fr/syntax_iterators.py index d86662693..5f7ba5c10 100644 --- a/spacy/lang/fr/syntax_iterators.py +++ b/spacy/lang/fr/syntax_iterators.py @@ -6,16 +6,35 @@ from ...tokens import Doc, Span def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]: - """Detect base noun phrases from a dependency parse. Works on Doc and Span.""" - # fmt: off - labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"] - # fmt: on + """ + Detect base noun phrases from a dependency parse. Works on both Doc and Span. + """ + labels = [ + "nsubj", + "nsubj:pass", + "obj", + "obl", + "obl:agent", + "obl:arg", + "obl:mod", + "nmod", + "pcomp", + "appos", + "ROOT", + ] + post_modifiers = ["flat", "flat:name", "flat:foreign", "fixed", "compound"] doc = doclike.doc # Ensure works on both Doc and Span. if not doc.has_annotation("DEP"): raise ValueError(Errors.E029) - np_deps = [doc.vocab.strings[label] for label in labels] - conj = doc.vocab.strings.add("conj") + np_deps = {doc.vocab.strings.add(label) for label in labels} + np_modifs = {doc.vocab.strings.add(modifier) for modifier in post_modifiers} np_label = doc.vocab.strings.add("NP") + adj_label = doc.vocab.strings.add("amod") + det_label = doc.vocab.strings.add("det") + det_pos = doc.vocab.strings.add("DET") + adp_pos = doc.vocab.strings.add("ADP") + conj_label = doc.vocab.strings.add("conj") + conj_pos = doc.vocab.strings.add("CCONJ") prev_end = -1 for i, word in enumerate(doclike): if word.pos not in (NOUN, PROPN, PRON): @@ -24,16 +43,45 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]: if word.left_edge.i <= prev_end: continue if word.dep in np_deps: - prev_end = word.right_edge.i - yield word.left_edge.i, word.right_edge.i + 1, np_label - elif word.dep == conj: + right_childs = list(word.rights) + right_child = right_childs[0] if right_childs else None + + if right_child: + if ( + right_child.dep == adj_label + ): # allow chain of adjectives by expanding to right + right_end = right_child.right_edge + elif ( + right_child.dep == det_label and right_child.pos == det_pos + ): # cut relative pronouns here + right_end = right_child + elif right_child.dep in np_modifs: # Check if we can expand to right + right_end = word.right_edge + else: + right_end = word + else: + right_end = word + prev_end = right_end.i + + left_index = word.left_edge.i + left_index = ( + left_index + 1 if word.left_edge.pos == adp_pos else left_index + ) + + yield left_index, right_end.i + 1, np_label + elif word.dep == conj_label: head = word.head - while head.dep == conj and head.head.i < head.i: + while head.dep == conj_label and head.head.i < head.i: head = head.head # If the head is an NP, and we're coordinated to it, we're an NP if head.dep in np_deps: - prev_end = word.right_edge.i - yield word.left_edge.i, word.right_edge.i + 1, np_label + prev_end = word.i + + left_index = word.left_edge.i # eliminate left attached conjunction + left_index = ( + left_index + 1 if word.left_edge.pos == conj_pos else left_index + ) + yield left_index, word.i + 1, np_label SYNTAX_ITERATORS = {"noun_chunks": noun_chunks} diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 2e75f9964..002a8f027 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -145,6 +145,11 @@ def fr_tokenizer(): return get_lang_class("fr")().tokenizer +@pytest.fixture(scope="session") +def fr_vocab(): + return get_lang_class("fr")().vocab + + @pytest.fixture(scope="session") def ga_tokenizer(): return get_lang_class("ga")().tokenizer diff --git a/spacy/tests/lang/fr/test_noun_chunks.py b/spacy/tests/lang/fr/test_noun_chunks.py index 48ac88ead..25b95f566 100644 --- a/spacy/tests/lang/fr/test_noun_chunks.py +++ b/spacy/tests/lang/fr/test_noun_chunks.py @@ -1,8 +1,230 @@ +from spacy.tokens import Doc import pytest +# fmt: off +@pytest.mark.parametrize( + "words,heads,deps,pos,chunk_offsets", + [ + # determiner + noun + # un nom -> un nom + ( + ["un", "nom"], + [1, 1], + ["det", "ROOT"], + ["DET", "NOUN"], + [(0, 2)], + ), + # determiner + noun starting with vowel + # l'heure -> l'heure + ( + ["l'", "heure"], + [1, 1], + ["det", "ROOT"], + ["DET", "NOUN"], + [(0, 2)], + ), + # determiner + plural noun + # les romans -> les romans + ( + ["les", "romans"], + [1, 1], + ["det", "ROOT"], + ["DET", "NOUN"], + [(0, 2)], + ), + # det + adj + noun + # Le vieux Londres -> Le vieux Londres + ( + ['Les', 'vieux', 'Londres'], + [2, 2, 2], + ["det", "amod", "ROOT"], + ["DET", "ADJ", "NOUN"], + [(0,3)] + ), + # det + noun + adj + # le nom propre -> le nom propre a proper noun + ( + ["le", "nom", "propre"], + [1, 1, 1], + ["det", "ROOT", "amod"], + ["DET", "NOUN", "ADJ"], + [(0, 3)], + ), + # det + noun + adj plural + # Les chiens bruns -> les chiens bruns + ( + ["Les", "chiens", "bruns"], + [1, 1, 1], + ["det", "ROOT", "amod"], + ["DET", "NOUN", "ADJ"], + [(0, 3)], + ), + # multiple adjectives: one adj before the noun, one adj after the noun + # un nouveau film intéressant -> un nouveau film intéressant + ( + ["un", "nouveau", "film", "intéressant"], + [2, 2, 2, 2], + ["det", "amod", "ROOT", "amod"], + ["DET", "ADJ", "NOUN", "ADJ"], + [(0,4)] + ), + # multiple adjectives, both adjs after the noun + # une personne intelligente et drôle -> une personne intelligente et drôle + ( + ["une", "personne", "intelligente", "et", "drôle"], + [1, 1, 1, 4, 2], + ["det", "ROOT", "amod", "cc", "conj"], + ["DET", "NOUN", "ADJ", "CCONJ", "ADJ"], + [(0,5)] + ), + # relative pronoun + # un bus qui va au ville -> un bus, qui, ville + ( + ['un', 'bus', 'qui', 'va', 'au', 'ville'], + [1, 1, 3, 1, 5, 3], + ['det', 'ROOT', 'nsubj', 'acl:relcl', 'case', 'obl:arg'], + ['DET', 'NOUN', 'PRON', 'VERB', 'ADP', 'NOUN'], + [(0,2), (2,3), (5,6)] + ), + # relative subclause + # Voilà la maison que nous voulons acheter -> la maison, nous That's the house that we want to buy. + ( + ['Voilà', 'la', 'maison', 'que', 'nous', 'voulons', 'acheter'], + [0, 2, 0, 5, 5, 2, 5], + ['ROOT', 'det', 'obj', 'mark', 'nsubj', 'acl:relcl', 'xcomp'], + ['VERB', 'DET', 'NOUN', 'SCONJ', 'PRON', 'VERB', 'VERB'], + [(1,3), (4,5)] + ), + # Person name and title by flat + # Louis XIV -> Louis XIV + ( + ["Louis", "XIV"], + [0, 0], + ["ROOT", "flat:name"], + ["PROPN", "PROPN"], + [(0,2)] + ), + # Organization name by flat + # Nations Unies -> Nations Unies + ( + ["Nations", "Unies"], + [0, 0], + ["ROOT", "flat:name"], + ["PROPN", "PROPN"], + [(0,2)] + ), + # Noun compound, person name created by two flats + # Louise de Bratagne -> Louise de Bratagne + ( + ["Louise", "de", "Bratagne"], + [0, 0, 0], + ["ROOT", "flat:name", "flat:name"], + ["PROPN", "PROPN", "PROPN"], + [(0,3)] + ), + # Noun compound, person name created by two flats + # Louis François Joseph -> Louis François Joseph + ( + ["Louis", "François", "Joseph"], + [0, 0, 0], + ["ROOT", "flat:name", "flat:name"], + ["PROPN", "PROPN", "PROPN"], + [(0,3)] + ), + # one determiner + one noun + one adjective qualified by an adverb + # quelques agriculteurs très riches -> quelques agriculteurs très riches + ( + ["quelques", "agriculteurs", "très", "riches"], + [1, 1, 3, 1], + ['det', 'ROOT', 'advmod', 'amod'], + ['DET', 'NOUN', 'ADV', 'ADJ'], + [(0,4)] + ), + # Two NPs conjuncted + # Il a un chien et un chat -> Il, un chien, un chat + ( + ['Il', 'a', 'un', 'chien', 'et', 'un', 'chat'], + [1, 1, 3, 1, 6, 6, 3], + ['nsubj', 'ROOT', 'det', 'obj', 'cc', 'det', 'conj'], + ['PRON', 'VERB', 'DET', 'NOUN', 'CCONJ', 'DET', 'NOUN'], + [(0,1), (2,4), (5,7)] + + ), + # Two NPs together + # l'écrivain brésilien Aníbal Machado -> l'écrivain brésilien, Aníbal Machado + ( + ["l'", 'écrivain', 'brésilien', 'Aníbal', 'Machado'], + [1, 1, 1, 1, 3], + ['det', 'ROOT', 'amod', 'appos', 'flat:name'], + ['DET', 'NOUN', 'ADJ', 'PROPN', 'PROPN'], + [(0, 3), (3, 5)] + ), + # nmod relation between NPs + # la destruction de la ville -> la destruction, la ville + ( + ['la', 'destruction', 'de', 'la', 'ville'], + [1, 1, 4, 4, 1], + ['det', 'ROOT', 'case', 'det', 'nmod'], + ['DET', 'NOUN', 'ADP', 'DET', 'NOUN'], + [(0,2), (3,5)] + ), + # nmod relation between NPs + # Archiduchesse d’Autriche -> Archiduchesse, Autriche + ( + ['Archiduchesse', 'd’', 'Autriche'], + [0, 2, 0], + ['ROOT', 'case', 'nmod'], + ['NOUN', 'ADP', 'PROPN'], + [(0,1), (2,3)] + ), + # Compounding by nmod, several NPs chained together + # la première usine de drogue du gouvernement -> la première usine, drogue, gouvernement + ( + ["la", "première", "usine", "de", "drogue", "du", "gouvernement"], + [2, 2, 2, 4, 2, 6, 2], + ['det', 'amod', 'ROOT', 'case', 'nmod', 'case', 'nmod'], + ['DET', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN'], + [(0, 3), (4, 5), (6, 7)] + ), + # several NPs + # Traduction du rapport de Susana -> Traduction, rapport, Susana + ( + ['Traduction', 'du', 'raport', 'de', 'Susana'], + [0, 2, 0, 4, 2], + ['ROOT', 'case', 'nmod', 'case', 'nmod'], + ['NOUN', 'ADP', 'NOUN', 'ADP', 'PROPN'], + [(0,1), (2,3), (4,5)] + + ), + # Several NPs + # Le gros chat de Susana et son amie -> Le gros chat, Susana, son amie + ( + ['Le', 'gros', 'chat', 'de', 'Susana', 'et', 'son', 'amie'], + [2, 2, 2, 4, 2, 7, 7, 2], + ['det', 'amod', 'ROOT', 'case', 'nmod', 'cc', 'det', 'conj'], + ['DET', 'ADJ', 'NOUN', 'ADP', 'PROPN', 'CCONJ', 'DET', 'NOUN'], + [(0,3), (4,5), (6,8)] + ), + # Passive subject + # Les nouvelles dépenses sont alimentées par le grand compte bancaire de Clinton -> Les nouvelles dépenses, le grand compte bancaire, Clinton + ( + ['Les', 'nouvelles', 'dépenses', 'sont', 'alimentées', 'par', 'le', 'grand', 'compte', 'bancaire', 'de', 'Clinton'], + [2, 2, 4, 4, 4, 8, 8, 8, 4, 8, 11, 8], + ['det', 'amod', 'nsubj:pass', 'aux:pass', 'ROOT', 'case', 'det', 'amod', 'obl:agent', 'amod', 'case', 'nmod'], + ['DET', 'ADJ', 'NOUN', 'AUX', 'VERB', 'ADP', 'DET', 'ADJ', 'NOUN', 'ADJ', 'ADP', 'PROPN'], + [(0, 3), (6, 10), (11, 12)] + ) + ], +) +# fmt: on +def test_fr_noun_chunks(fr_vocab, words, heads, deps, pos, chunk_offsets): + doc = Doc(fr_vocab, words=words, heads=heads, deps=deps, pos=pos) + assert [(c.start, c.end) for c in doc.noun_chunks] == chunk_offsets + + def test_noun_chunks_is_parsed_fr(fr_tokenizer): """Test that noun_chunks raises Value Error for 'fr' language if Doc is not parsed.""" - doc = fr_tokenizer("trouver des travaux antérieurs") + doc = fr_tokenizer("Je suis allé à l'école") with pytest.raises(ValueError): list(doc.noun_chunks)