From f0e8c9fe58267edaef3d82fe12ad5c0fb5c431e6 Mon Sep 17 00:00:00 2001
From: Duygu Altinok <duygu@explosion.ai>
Date: Fri, 5 Nov 2021 00:46:36 +0100
Subject: [PATCH] Spanish noun chunks review (#9537)

* updated syntax iters

* formatted the code

* added prepositional objects

* code clean up

* eliminated left attached adp

* added es vocab

* added basic tests

* fixed typo

* fixed typo

* list to set

* fixed doc name

* added code for conj

* more tests

* differentiated adjectives and flat

* fixed typo

* added compounds

* more compounds

* tests for compounds

* tests for nominal modifiers

* fixed typo

* fixed typo

* formatted file

* reformatted tests

* fixed typo

* fixed punct typo

* formatted after changes

* added indirect object

* added full sentence examples

* added longer full sentence examples

* fixed sentence length of test

* added passive subj

* added test case by Damian
---
 spacy/lang/es/syntax_iterators.py       | 106 ++++++++++-------
 spacy/tests/conftest.py                 |   5 +
 spacy/tests/lang/es/test_noun_chunks.py | 150 ++++++++++++++++++++++++
 3 files changed, 217 insertions(+), 44 deletions(-)

diff --git a/spacy/lang/es/syntax_iterators.py b/spacy/lang/es/syntax_iterators.py
index 8b385a1b9..f2ca2a678 100644
--- a/spacy/lang/es/syntax_iterators.py
+++ b/spacy/lang/es/syntax_iterators.py
@@ -1,58 +1,76 @@
 from typing import Union, Iterator, Tuple
 
-from ...symbols import NOUN, PROPN, PRON, VERB, AUX
+from ...symbols import NOUN, PROPN, PRON
 from ...errors import Errors
-from ...tokens import Doc, Span, Token
+from ...tokens import Doc, Span
 
 
 def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
-    """Detect base noun phrases from a dependency parse. Works on Doc and Span."""
-    doc = doclike.doc
+    """
+    Detect base noun phrases from a dependency parse. Works on both Doc and Span.
+    """
+    labels = [
+        "nsubj",
+        "nsubj:pass",
+        "obj",
+        "obl",
+        "nmod",
+        "pcomp",
+        "appos",
+        "ROOT",
+    ]
+    post_modifiers = ["flat", "fixed", "compound"]
+    doc = doclike.doc  # Ensure works on both Doc and Span.
     if not doc.has_annotation("DEP"):
         raise ValueError(Errors.E029)
-    if not len(doc):
-        return
+    np_deps = {doc.vocab.strings.add(label) for label in labels}
+    np_modifs = {doc.vocab.strings.add(modifier) for modifier in post_modifiers}
     np_label = doc.vocab.strings.add("NP")
-    left_labels = ["det", "fixed", "neg"]  # ['nunmod', 'det', 'appos', 'fixed']
-    right_labels = ["flat", "fixed", "compound", "neg"]
-    stop_labels = ["punct"]
-    np_left_deps = [doc.vocab.strings.add(label) for label in left_labels]
-    np_right_deps = [doc.vocab.strings.add(label) for label in right_labels]
-    stop_deps = [doc.vocab.strings.add(label) for label in stop_labels]
+    adj_label = doc.vocab.strings.add("amod")
+    adp_label = doc.vocab.strings.add("ADP")
+    conj = doc.vocab.strings.add("conj")
+    conj_pos = doc.vocab.strings.add("CCONJ")
+    prev_end = -1
+    for i, word in enumerate(doclike):
+        if word.pos not in (NOUN, PROPN, PRON):
+            continue
+        # Prevent nested chunks from being produced
+        if word.left_edge.i <= prev_end:
+            continue
+        if word.dep in np_deps:
+            right_childs = list(word.rights)
+            right_child = right_childs[0] if right_childs else None
 
-    prev_right = -1
-    for token in doclike:
-        if token.pos in [PROPN, NOUN, PRON]:
-            left, right = noun_bounds(
-                doc, token, np_left_deps, np_right_deps, stop_deps
-            )
-            if left.i <= prev_right:
-                continue
-            yield left.i, right.i + 1, np_label
-            prev_right = right.i
-
-
-def is_verb_token(token: Token) -> bool:
-    return token.pos in [VERB, AUX]
-
-
-def noun_bounds(doc, root, np_left_deps, np_right_deps, stop_deps):
-    left_bound = root
-    for token in reversed(list(root.lefts)):
-        if token.dep in np_left_deps:
-            left_bound = token
-    right_bound = root
-    for token in root.rights:
-        if token.dep in np_right_deps:
-            left, right = noun_bounds(
-                doc, token, np_left_deps, np_right_deps, stop_deps
-            )
-            filter_func = lambda t: is_verb_token(t) or t.dep in stop_deps
-            if list(filter(filter_func, doc[left_bound.i : right.i])):
-                break
+            if right_child:
+                if right_child.dep == adj_label:
+                    right_end = right_child.right_edge
+                elif right_child.dep in np_modifs:  # Check if we can expand to right
+                    right_end = word.right_edge
+                else:
+                    right_end = word
             else:
-                right_bound = right
-    return left_bound, right_bound
+                right_end = word
+            prev_end = right_end.i
+
+            left_index = word.left_edge.i
+            left_index = (
+                left_index + 1 if word.left_edge.pos == adp_label else left_index
+            )  # Eliminate left attached de, del
+
+            yield left_index, right_end.i + 1, np_label
+        elif word.dep == conj:
+            head = word.head
+            while head.dep == conj and head.head.i < head.i:
+                head = head.head
+            # If the head is an NP, and we're coordinated to it, we're an NP
+            if head.dep in np_deps:
+                prev_end = word.i
+
+                left_index = word.left_edge.i  # eliminate left attached conjunction
+                left_index = (
+                    left_index + 1 if word.left_edge.pos == conj_pos else left_index
+                )
+                yield left_index, word.i + 1, np_label
 
 
 SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py
index afe23888d..88c7adfe3 100644
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -120,6 +120,11 @@ def es_tokenizer():
     return get_lang_class("es")().tokenizer
 
 
+@pytest.fixture(scope="session")
+def es_vocab():
+    return get_lang_class("es")().vocab
+
+
 @pytest.fixture(scope="session")
 def eu_tokenizer():
     return get_lang_class("eu")().tokenizer
diff --git a/spacy/tests/lang/es/test_noun_chunks.py b/spacy/tests/lang/es/test_noun_chunks.py
index e5afd81c9..6118a0458 100644
--- a/spacy/tests/lang/es/test_noun_chunks.py
+++ b/spacy/tests/lang/es/test_noun_chunks.py
@@ -1,6 +1,156 @@
+from spacy.tokens import Doc
 import pytest
 
 
+# fmt: off
+@pytest.mark.parametrize(
+    "words,heads,deps,pos,chunk_offsets",
+    [
+        # un gato -> "un gato"
+        (
+            ["un", "gato"],
+            [1, 1],
+            ["det", "ROOT"],
+            ["DET", "NOUN"],
+            [(0, 2)],
+        ),
+        # la camisa negra -> "la camisa negra"
+        (
+            ["la", "camisa", "negra"],
+            [1, 1, 1],
+            ["det", "ROOT", "amod"],
+            ["DET", "NOUN", "ADJ"],
+            [(0, 3)],
+        ),
+        # un lindo gatito -> "un lindo gatito"
+        (
+            ["Un", "lindo", "gatito"],
+            [2, 2, 2],
+            ["det", "amod", "ROOT"],
+            ["DET", "ADJ", "NOUN"],
+            [(0,3)]
+        ),
+        # una chica hermosa e inteligente -> una chica hermosa e inteligente
+        (
+            ["Una", "chica", "hermosa", "e", "inteligente"],
+            [1, 1, 1, 4, 2],
+            ["det", "ROOT", "amod", "cc", "conj"],
+            ["DET", "NOUN", "ADJ", "CCONJ", "ADJ"],
+            [(0,5)]
+        ),
+        # el fabuloso gato pardo -> "el fabuloso gato pardo"
+        (
+            ["el", "fabuloso", "gato", "pardo"],
+            [2, 2, 2, 2],
+            ["det", "amod", "ROOT", "amod"],
+            ["DET", "ADJ", "NOUN", "ADJ"],
+            [(0,4)]
+        ),
+        # Tengo un gato y un perro -> un gato, un perro
+        ( 
+            ["Tengo", "un", "gato", "y", "un", "perro"],
+            [0, 2, 0, 5, 5, 0],
+            ["ROOT", "det", "obj", "cc", "det", "conj"],
+            ["VERB", "DET", "NOUN", "CCONJ", "DET", "NOUN"],
+            [(1,3), (4,6)]
+         
+        ),
+        # Dom Pedro II -> Dom Pedro II
+        (
+            ["Dom", "Pedro", "II"],
+            [0, 0, 0],
+            ["ROOT", "flat", "flat"],
+            ["PROPN", "PROPN", "PROPN"],
+            [(0,3)]
+        ),
+        # los Estados Unidos -> los Estados Unidos
+        (
+            ["los", "Estados", "Unidos"],
+            [1, 1, 1],
+            ["det", "ROOT", "flat"],
+            ["DET", "PROPN", "PROPN"],
+            [(0,3)]
+        ),
+        # Miguel de Cervantes -> Miguel de Cervantes
+        (
+            ["Miguel", "de", "Cervantes"],
+            [0, 2, 0],
+            ["ROOT", "case", "flat"],
+            ["PROPN", "ADP", "PROPN"],
+            [(0,3)]
+        ),
+        (
+            ["Rio", "de", "Janeiro"],
+            [0, 2, 0],
+            ["ROOT", "case", "flat"],
+            ["PROPN", "ADP", "PROPN"],
+            [(0,3)]
+        ),
+        # la destrucción de la ciudad -> la destrucción, la ciudad
+        (
+            ["la", "destrucción", "de", "la", "ciudad"],
+            [1, 1, 4, 4, 1],
+            ['det', 'ROOT', 'case', 'det', 'nmod'],
+            ['DET', 'NOUN', 'ADP', 'DET', 'NOUN'],
+            [(0,2), (3,5)]
+        ),
+        # la traducción de Susana del informe -> la traducción, Susana, informe
+        (
+            ['la', 'traducción', 'de', 'Susana', 'del', 'informe'],
+            [1, 1, 3, 1, 5, 1],
+            ['det', 'ROOT', 'case', 'nmod', 'case', 'nmod'],
+            ['DET', 'NOUN', 'ADP', 'PROPN', 'ADP', 'NOUN'],
+            [(0,2), (3,4), (5,6)]  
+       
+        ),
+        # El gato regordete de Susana y su amigo -> el gato regordete, Susana, su amigo
+        (  
+            ['El', 'gato', 'regordete', 'de', 'Susana', 'y', 'su', 'amigo'],
+            [1, 1, 1, 4, 1, 7, 7, 1],
+            ['det', 'ROOT', 'amod', 'case', 'nmod', 'cc', 'det', 'conj'],
+            ['DET', 'NOUN', 'ADJ', 'ADP', 'PROPN', 'CCONJ', 'DET', 'NOUN'],
+            [(0,3), (4,5), (6,8)]
+        ),
+        # Afirmó que sigue el criterio europeo y que trata de incentivar el mercado donde no lo hay -> el criterio europeo, el mercado, donde, lo
+        (
+            ['Afirmó', 'que', 'sigue', 'el', 'criterio', 'europeo', 'y', 'que', 'trata', 'de', 'incentivar', 'el', 'mercado', 'donde', 'no', 'lo', 'hay'],
+            [0, 2, 0, 4, 2, 4, 8, 8, 2, 10, 8, 12, 10, 16, 16, 16, 0],
+            ['ROOT', 'mark', 'ccomp', 'det', 'obj', 'amod', 'cc', 'mark', 'conj', 'mark', 'xcomp', 'det', 'obj', 'obl', 'advmod', 'obj', 'advcl'],
+            ['VERB', 'SCONJ', 'VERB', 'DET', 'NOUN', 'ADJ', 'CCONJ', 'SCONJ', 'VERB', 'ADP', 'VERB', 'DET', 'NOUN', 'PRON', 'ADV', 'PRON', 'AUX'],
+            [(3,6), (11,13), (13,14), (15,16)]
+        ),
+        # En este sentido se refirió a la reciente creación del Ministerio de Ciencia y Tecnología y a las primeras declaraciones de su titular, Anna Birulés, sobre el impulso de la investigación, desarrollo e innovación -> "este sentido", "se", "la reciente creación", "Ministerio de Ciencia y Tecnología", "a las primeras declaraciones", "su titular", ", Anna Birulés,", "el impulso", "la investigación", ", desarrollo", "innovación"
+        (
+            ['En', 'este', 'sentido', 'se', 'refirió', 'a', 'la', 'reciente', 'creación', 'del', 'Ministerio', 'de', 'Ciencia', 'y', 'Tecnología', 'y', 'a', 'las', 'primeras', 'declaraciones', 'de', 'su', 'titular', ',', 'Anna', 'Birulés', ',', 'sobre', 'el', 'impulso', 'de', 'la', 'investigación', ',', 'desarrollo', 'e', 'innovación'],
+            [2, 2, 4, 4, 4, 8, 8, 8, 4, 10, 8, 12, 10, 14, 12, 19, 19, 19, 19, 8, 22, 22, 19, 24, 22, 24, 24, 29, 29, 19, 32, 32, 29, 34, 32, 36, 32],
+            ['case', 'det', 'obl', 'obj', 'ROOT', 'case', 'det', 'amod', 'obj', 'case', 'nmod', 'case', 'flat', 'cc', 'conj', 'cc', 'case', 'det', 'amod', 'conj', 'case', 'det', 'nmod', 'punct', 'appos', 'flat', 'punct', 'case', 'det', 'nmod', 'case', 'det', 'nmod', 'punct', 'conj', 'cc', 'conj'],
+            ['ADP', 'DET', 'NOUN', 'PRON', 'VERB', 'ADP', 'DET', 'ADJ', 'NOUN', 'ADP', 'PROPN', 'ADP', 'PROPN', 'CCONJ', 'PROPN', 'CCONJ', 'ADP', 'DET', 'ADJ', 'NOUN', 'ADP', 'DET', 'NOUN', 'PUNCT', 'PROPN', 'PROPN', 'PUNCT', 'ADP', 'DET', 'NOUN', 'ADP', 'DET', 'NOUN', 'PUNCT', 'NOUN', 'CCONJ', 'NOUN'],
+            [(1, 3), (3, 4), (6, 9), (10, 15), (16, 20), (21, 23), (23, 27), (28, 30), (31, 33), (33, 35), (36, 37)]
+        ),
+        # Asimismo defiende la financiación pública de la investigación básica y pone de manifiesto que las empresas se centran más en la investigación y desarrollo con objetivos de mercado. -> la financiación pública, la investigación básica, manifiesto, las empresas, se, la investigación, desarrollo, objetivos, mercado
+        (
+            ['Asimismo', 'defiende', 'la', 'financiación', 'pública', 'de', 'la', 'investigación', 'básica', 'y', 'pone', 'de', 'manifiesto', 'que', 'las', 'empresas', 'se', 'centran', 'más', 'en', 'la', 'investigación', 'y', 'desarrollo', 'con', 'objetivos', 'de', 'mercado'],
+            [1, 1, 3, 1, 3, 7, 7, 3, 7, 10, 1, 12, 10, 17, 15, 17, 17, 10, 17, 21, 21, 17, 23, 21, 25, 17, 27, 25],
+            ['advmod', 'ROOT', 'det', 'obj', 'amod', 'case', 'det', 'nmod', 'amod', 'cc', 'conj', 'case', 'obl', 'mark', 'det', 'nsubj', 'obj', 'ccomp', 'obj', 'case', 'det', 'obl', 'cc', 'conj', 'case', 'obl', 'case', 'nmod'],
+            ['ADV', 'VERB', 'DET', 'NOUN', 'ADJ', 'ADP', 'DET', 'NOUN', 'ADJ', 'CCONJ', 'VERB', 'ADP', 'NOUN', 'SCONJ', 'DET', 'NOUN', 'PRON', 'VERB', 'ADV', 'ADP', 'DET', 'NOUN', 'CCONJ', 'NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN'],
+            [(2, 5), (6, 9), (12, 13), (14, 16), (16, 17), (20, 22), (23, 24), (25, 26), (27, 28)]
+        ),
+        # Tras indicar que la inversión media en investigación en la Unión Europea se sitúa en el 1,8 por ciento del PIB, frente al 2,8 por ciento en Japón y EEUU, Couceiro dijo que España está en "el buen camino" y se está creando un entorno propicio para la innovación empresarial -> la inversión media, investigación, la Unión Europea, se, PIB, Japón, EEUU, Couceiro, España, se, un entorno propicio para la innovación empresarial
+        (
+            ['Tras', 'indicar', 'que', 'la', 'inversión', 'media', 'en', 'investigación', 'en', 'la', 'Unión', 'Europea', 'se', 'sitúa', 'en', 'el', '1,8', 'por', 'ciento', 'del', 'PIB', ',', 'frente', 'al', '2,8', 'por', 'ciento', 'en', 'Japón', 'y', 'EEUU', ',', 'Couceiro', 'dijo', 'que', 'España', 'está', 'en', '"', 'el', 'buen', 'camino', '"', 'y', 'se', 'está', 'creando', 'un', 'entorno', 'propicio', 'para', 'la', 'innovación', 'empresarial'],
+            [1, 33, 13, 4, 13, 4, 7, 4, 10, 10, 4, 10, 13, 1, 16, 16, 13, 18, 16, 20, 16, 24, 24, 22, 13, 26, 24, 28, 24, 30, 28, 1, 33, 33, 41, 41, 41, 41, 41, 41, 41, 33, 41, 46, 46, 46, 33, 48, 46, 48, 52, 52, 49, 52],
+            ['mark', 'advcl', 'mark', 'det', 'nsubj', 'amod', 'case', 'nmod', 'case', 'det', 'nmod', 'flat', 'obj', 'ccomp', 'case', 'det', 'obj', 'case', 'compound', 'case', 'nmod', 'punct', 'case', 'fixed', 'obl', 'case', 'compound', 'case', 'nmod', 'cc', 'conj', 'punct', 'nsubj', 'ROOT', 'mark', 'nsubj', 'cop', 'case', 'punct', 'det', 'amod', 'ccomp', 'punct', 'cc', 'obj', 'aux', 'conj', 'det', 'nsubj', 'amod', 'case', 'det', 'nmod', 'amod'],
+            ['ADP', 'VERB', 'SCONJ', 'DET', 'NOUN', 'ADJ', 'ADP', 'NOUN', 'ADP', 'DET', 'PROPN', 'PROPN', 'PRON', 'VERB', 'ADP', 'DET', 'NUM', 'ADP', 'NUM', 'ADP', 'PROPN', 'PUNCT', 'NOUN', 'ADP', 'NUM', 'ADP', 'NUM', 'ADP', 'PROPN', 'CCONJ', 'PROPN', 'PUNCT', 'PROPN', 'VERB', 'SCONJ', 'PROPN', 'AUX', 'ADP', 'PUNCT', 'DET', 'ADJ', 'NOUN', 'PUNCT', 'CCONJ', 'PRON', 'AUX', 'VERB', 'DET', 'NOUN', 'ADJ', 'ADP', 'DET', 'NOUN', 'ADJ'],
+            [(3, 6), (7, 8), (9, 12), (12, 13), (20, 21), (28, 29), (30, 31), (32, 33), (35, 36), (44, 45), (47, 54)]
+        ),
+    ],
+)
+# fmt: on
+def test_es_noun_chunks(es_vocab, words, heads, deps, pos, chunk_offsets):
+    doc = Doc(es_vocab, words=words, heads=heads, deps=deps, pos=pos)
+    assert [(c.start, c.end) for c in doc.noun_chunks] == chunk_offsets
+
+
 def test_noun_chunks_is_parsed_es(es_tokenizer):
     """Test that noun_chunks raises Value Error for 'es' language if Doc is not parsed."""
     doc = es_tokenizer("en Oxford este verano")