Portuguese noun chunks review (#9559)

* added tests * added pt vocab * transferred spanish * added syntax iters * fixed parenthesis * added nmod example * added relative pron * fixed rel pron * added rel subclause * corrected typo * added more NP chains * long sentence * fixed typo * fixed typo * fixed typo * corrected heads * added passive subj * added pass subj * added passive obj * refinement to rights * went back to odl * fixed test * fixed typo * fixed typo * formatted * Format * Format test cases Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
2025-10-16 00:36:58 +03:00 · 2021-11-04 23:55:49 +01:00 · 2021-11-04 23:55:49 +01:00 · 6e6650307d
commit 6e6650307d
parent 2bf52c44b1
4 changed files with 313 additions and 0 deletions
--- a/spacy/lang/pt/init.py
+++ b/spacy/lang/pt/init.py
@ -1,6 +1,7 @@
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .syntax_iterators import SYNTAX_ITERATORS
 from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES
 from ...language import Language, BaseDefaults
@ -10,6 +11,7 @@ class PortugueseDefaults(BaseDefaults):
    infixes = TOKENIZER_INFIXES
    prefixes = TOKENIZER_PREFIXES
    lex_attr_getters = LEX_ATTRS
    syntax_iterators = SYNTAX_ITERATORS
    stop_words = STOP_WORDS
--- a/spacy/lang/pt/syntax_iterators.py
+++ b/spacy/lang/pt/syntax_iterators.py
@ -0,0 +1,85 @@
 from typing import Union, Iterator, Tuple
 from ...symbols import NOUN, PROPN, PRON
 from ...errors import Errors
 from ...tokens import Doc, Span
 def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
    """
    Detect base noun phrases from a dependency parse. Works on both Doc and Span.
    """
    labels = [
        "nsubj",
        "nsubj:pass",
        "obj",
        "obl",
        "obl:agent",
        "nmod",
        "pcomp",
        "appos",
        "ROOT",
    ]
    post_modifiers = ["flat", "flat:name", "fixed", "compound"]
    doc = doclike.doc  # Ensure works on both Doc and Span.
    if not doc.has_annotation("DEP"):
        raise ValueError(Errors.E029)
    np_deps = {doc.vocab.strings.add(label) for label in labels}
    np_modifs = {doc.vocab.strings.add(modifier) for modifier in post_modifiers}
    np_label = doc.vocab.strings.add("NP")
    adj_label = doc.vocab.strings.add("amod")
    det_label = doc.vocab.strings.add("det")
    det_pos = doc.vocab.strings.add("DET")
    adp_label = doc.vocab.strings.add("ADP")
    conj = doc.vocab.strings.add("conj")
    conj_pos = doc.vocab.strings.add("CCONJ")
    prev_end = -1
    for i, word in enumerate(doclike):
        if word.pos not in (NOUN, PROPN, PRON):
            continue
        # Prevent nested chunks from being produced
        if word.left_edge.i <= prev_end:
            continue
        if word.dep in np_deps:
            right_childs = list(word.rights)
            right_child = right_childs[0] if right_childs else None
            if right_child:
                if (
                    right_child.dep == adj_label
                ):  # allow chain of adjectives by expanding to right
                    right_end = right_child.right_edge
                elif (
                    right_child.dep == det_label and right_child.pos == det_pos
                ):  # cut relative pronouns here
                    right_end = right_child
                elif right_child.dep in np_modifs:  # Check if we can expand to right
                    right_end = word.right_edge
                else:
                    right_end = word
            else:
                right_end = word
            prev_end = right_end.i
            left_index = word.left_edge.i
            left_index = (
                left_index + 1 if word.left_edge.pos == adp_label else left_index
            )
            yield left_index, right_end.i + 1, np_label
        elif word.dep == conj:
            head = word.head
            while head.dep == conj and head.head.i < head.i:
                head = head.head
            # If the head is an NP, and we're coordinated to it, we're an NP
            if head.dep in np_deps:
                prev_end = word.i
                left_index = word.left_edge.i  # eliminate left attached conjunction
                left_index = (
                    left_index + 1 if word.left_edge.pos == conj_pos else left_index
                )
                yield left_index, word.i + 1, np_label
 SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@ -247,6 +247,11 @@ def pt_tokenizer():
    return get_lang_class("pt")().tokenizer
@pytest.fixture(scope="session")
 def pt_vocab():
    return get_lang_class("pt")().vocab
@pytest.fixture(scope="session")
 def ro_tokenizer():
    return get_lang_class("ro")().tokenizer
--- a/spacy/tests/lang/pt/test_noun_chunks.py
+++ b/spacy/tests/lang/pt/test_noun_chunks.py
@ -0,0 +1,221 @@
 from spacy.tokens import Doc
 import pytest
 # fmt: off
@pytest.mark.parametrize(
    "words,heads,deps,pos,chunk_offsets",
    [
        # determiner + noun
        # um cachorro -> um cachorro
        (
            ["um", "cachorro"],
            [1, 1],
            ["det", "ROOT"],
            ["DET", "NOUN"],
            [(0, 2)],
        ),
        # two determiners + noun
        # meu o pai -> meu o pai
        (
            ["meu", "o", "pai"],
            [2, 2, 2],
            ["det", "det", "ROOT"],
            ["DET", "DET", "NOUN"],
            [(0, 3)],
        ),
        # two determiners + noun
        # todos essos caros -> todos essos caros
        (
            ["todos", "essos", "caros"],
            [2, 2, 2],
            ["det", "det", "ROOT"],
            ["DET", "DET", "NOUN"],
            [(0, 3)],
        ),
        # two determiners, one is after noun
        # um irmão meu -> um irmão meu
        (
            ["um", "irmão", "meu"],
            [1, 1, 1],
            ["det", "ROOT", "det"],
            ["DET", "NOUN", "DET"],
            [(0, 3)],
        ),
        # two determiners + noun
        # o meu pai -> o meu pai
        (
            ["o", "meu", "pai"],
            [2, 2, 2],
            ["det","det", "ROOT"],
            ["DET", "DET", "NOUN"],
            [(0, 3)],
        ),
        # relative pronoun
        # A bicicleta essa está estragada -> A bicicleta
        (
            ['A', 'bicicleta', 'essa', 'está', 'estragada'],
            [1, 4, 1, 4, 4],
            ['det', 'nsubj', 'det', 'cop', 'ROOT'],
            ['DET', 'NOUN', 'PRON', 'AUX', 'ADJ'],
            [(0,2)]
        ),
        # relative subclause
        #  o computador que comprou -> o computador
        (
            ['o', 'computador', 'que', 'comprou'],
            [1, 1, 3, 1],
            ['det', 'ROOT', 'nsubj', 'acl:relcl'],
            ['DET', 'NOUN', 'PRON', 'VERB'],
            [(0, 2), (2, 3)]
        ),
        # det + noun + adj
        # O cachorro marrom  -> O cachorro marrom
        (
            ["O", "cachorro", "marrom"],
            [1, 1, 1],
            ["det", "ROOT", "amod"],
            ["DET", "NOUN", "ADJ"],
            [(0, 3)],
        ),
        # det + noun + adj plural
        # As calças baratas  -> As calças baratas
        (
            ["As", "calças", "baratas"],
            [1, 1, 1],
            ["det", "ROOT", "amod"],
            ["DET", "NOUN", "ADJ"],
            [(0, 3)],
        ),
        # det + adj + noun
        # Uma boa ideia -> Uma boa ideia
        (
            ['uma', 'boa', 'ideia'],
            [2, 2, 2],
            ["det", "amod", "ROOT"],
            ["DET", "ADJ", "NOUN"],
            [(0,3)]
        ),
        # multiple adjectives
        # Uma garota esperta e inteligente -> Uma garota esperta e inteligente
        (
            ["Uma", "garota", "esperta", "e", "inteligente"],
            [1, 1, 1, 4, 2],
            ["det", "ROOT", "amod", "cc", "conj"],
            ["DET", "NOUN", "ADJ", "CCONJ", "ADJ"],
            [(0,5)]
        ),
        # determiner, adjective, compound created by flat
        # a grande São Paolo -> a grande São Paolo
        (
            ["a", "grande", "São", "Paolo"],
            [2, 2, 2, 2],
            ["det", "amod", "ROOT", "flat:name"],
            ["DET", "ADJ", "PROPN", "PROPN"],
            [(0,4)]
        ),
        # one determiner + one noun + one adjective qualified by an adverb
        # alguns fazendeiros muito ricos -> alguns fazendeiros muito ricos
        (
            ['alguns', 'fazendeiros', 'muito', 'ricos'],
            [1, 1, 3, 1],
            ['det', 'ROOT', 'advmod', 'amod'],
            ['DET', 'NOUN', 'ADV', 'ADJ'],
            [(0,4)]
        ),
        # Two NPs conjuncted
        # Eu tenho um cachorro e um gato -> Eu, um cacharo, um gato
        ( 
            ["Eu", "tenho", "um", "cachorro", "e", "um", "gato"],
            [1, 1, 3, 1, 6, 6, 3],
            ['nsubj', 'ROOT', 'det', 'obj', 'cc', 'det', 'conj'],
            ['PRON', 'VERB', 'DET', 'NOUN', 'CCONJ', 'DET', 'NOUN'],
            [(0,1), (2,4), (5,7)]
        ),
        # Two NPs together
        # o escritor brasileiro Aníbal Machado -> o escritor brasileiro, Aníbal Machado
        (
            ['o', 'escritor', 'brasileiro', 'Aníbal', 'Machado'],
            [1, 1, 1, 1, 3],
            ['det', 'ROOT', 'amod', 'appos', 'flat:name'],
            ['DET', 'NOUN', 'ADJ', 'PROPN', 'PROPN'],
            [(0, 3), (3, 5)]
        ),
        # Noun compound, person name and titles
        # Dom Pedro II -> Dom Pedro II
        (
            ["Dom", "Pedro", "II"],
            [0, 0, 0],
            ["ROOT", "flat:name", "flat:name"],
            ["PROPN", "PROPN", "PROPN"],
            [(0,3)]
        ),
        # Noun compound created by flat
        # os Estados Unidos -> os Estados Unidos
        (
            ["os", "Estados", "Unidos"],
            [1, 1, 1],
            ["det", "ROOT", "flat:name"],
            ["DET", "PROPN", "PROPN"],
            [(0,3)]
        ),
        # nmod relation between NPs
        # a destruição da cidade -> a destruição, cidade
        (
            ['a', 'destruição', 'da', 'cidade'],
            [1, 1, 3, 1],
            ['det', 'ROOT', 'case', 'nmod'],
            ['DET', 'NOUN', 'ADP', 'NOUN'],
            [(0,2), (3,4)]
        ),
        # Compounding by nmod, several NPs chained together
        # a primeira fábrica de medicamentos do governo -> a primeira fábrica, medicamentos, governo
        (
            ["a", "primeira", "fábrica", "de", "medicamentos",  "do", "governo"],
            [2, 2, 2, 4, 2, 6, 2],
            ['det', 'amod', 'ROOT', 'case', 'nmod', 'case', 'nmod'],
            ['DET', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN'],
            [(0, 3), (4, 5), (6, 7)]
        ),
        # several NPs
        # Tradução da reportagem de Susana -> Tradução, reportagem, Susana
        (
            ['Tradução', 'da', 'reportagem', 'de', 'Susana'],
            [0, 2, 0, 4, 2],
            ['ROOT', 'case', 'nmod', 'case', 'nmod'],
            ['NOUN', 'ADP', 'NOUN', 'ADP', 'PROPN'],
            [(0,1), (2,3), (4,5)]  
        ),
        # Several NPs
        # O gato gordo da Susana e seu amigo -> O gato gordo, Susana, seu amigo
        (  
            ['O', 'gato', 'gordo', 'da', 'Susana', 'e', 'seu', 'amigo'],
            [1, 1, 1, 4, 1, 7, 7, 1],
            ['det', 'ROOT', 'amod', 'case', 'nmod', 'cc', 'det', 'conj'],
            ['DET', 'NOUN', 'ADJ', 'ADP', 'PROPN', 'CCONJ', 'DET', 'NOUN'],
            [(0,3), (4,5), (6,8)]
        ),
        # Passive subject
        # Os novos gastos são alimentados pela grande conta bancária de Clinton -> Os novos gastos, grande conta bancária, Clinton
        (
            ['Os', 'novos', 'gastos', 'são', 'alimentados', 'pela', 'grande', 'conta', 'bancária', 'de', 'Clinton'],
            [2, 2, 4, 4, 4, 7, 7, 4, 7, 10, 7],
            ['det', 'amod', 'nsubj:pass', 'aux:pass', 'ROOT', 'case', 'amod', 'obl:agent', 'amod', 'case', 'nmod'],
            ['DET', 'ADJ', 'NOUN', 'AUX', 'VERB', 'ADP', 'ADJ', 'NOUN', 'ADJ', 'ADP', 'PROPN'],
            [(0, 3), (6, 9), (10, 11)]
        )
    ],
 )
 # fmt: on
 def test_pt_noun_chunks(pt_vocab, words, heads, deps, pos, chunk_offsets):
    doc = Doc(pt_vocab, words=words, heads=heads, deps=deps, pos=pos)
    assert [(c.start, c.end) for c in doc.noun_chunks] == chunk_offsets
 def test_noun_chunks_is_parsed_pt(pt_tokenizer):
    """Test that noun_chunks raises Value Error for 'pt' language if Doc is not parsed."""
    doc = pt_tokenizer("en Oxford este verano")
    with pytest.raises(ValueError):
        list(doc.noun_chunks)