Portuguese noun chunks review (#9559)

* added tests

* added pt vocab

* transferred spanish

* added syntax iters

* fixed parenthesis

* added nmod example

* added relative pron

* fixed rel pron

* added rel subclause

* corrected typo

* added more NP chains

* long sentence

* fixed typo

* fixed typo

* fixed typo

* corrected heads

* added passive subj

* added pass subj

* added passive obj

* refinement to rights

* went back to old

* fixed test

* fixed typo

* fixed typo

* formatted

* Format

* Format test cases

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
This commit is contained in:
Duygu Altinok 2021-11-04 23:55:49 +01:00 committed by GitHub
parent 2bf52c44b1
commit 6e6650307d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 313 additions and 0 deletions

View File

@ -1,6 +1,7 @@
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from .syntax_iterators import SYNTAX_ITERATORS
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES
from ...language import Language, BaseDefaults from ...language import Language, BaseDefaults
@ -10,6 +11,7 @@ class PortugueseDefaults(BaseDefaults):
infixes = TOKENIZER_INFIXES infixes = TOKENIZER_INFIXES
prefixes = TOKENIZER_PREFIXES prefixes = TOKENIZER_PREFIXES
lex_attr_getters = LEX_ATTRS lex_attr_getters = LEX_ATTRS
syntax_iterators = SYNTAX_ITERATORS
stop_words = STOP_WORDS stop_words = STOP_WORDS

View File

@ -0,0 +1,85 @@
from typing import Union, Iterator, Tuple
from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors
from ...tokens import Doc, Span
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
    """Yield base noun phrases found in a dependency-parsed Doc or Span.

    Each yielded item is a ``(start, end, label)`` triple: ``start`` is the
    token index (``Token.i``) of the first token of the chunk, ``end`` is one
    past the index of its last token, and ``label`` is the string-store ID
    of ``"NP"``.

    Raises:
        ValueError: if the underlying Doc has no "DEP" annotation.
    """
    # A Span exposes its parent through ``.doc``; a Doc returns itself.
    doc = doclike.doc
    if not doc.has_annotation("DEP"):
        raise ValueError(Errors.E029)

    strings = doc.vocab.strings
    # Dependency relations whose nominal head opens a chunk.
    np_deps = {
        strings.add(dep_name)
        for dep_name in (
            "nsubj",
            "nsubj:pass",
            "obj",
            "obl",
            "obl:agent",
            "nmod",
            "pcomp",
            "appos",
            "ROOT",
        )
    }
    # Right-hand relations that let a chunk run to the head's right edge.
    np_modifs = {
        strings.add(dep_name)
        for dep_name in ("flat", "flat:name", "fixed", "compound")
    }
    np_label = strings.add("NP")
    adj_label = strings.add("amod")
    det_label = strings.add("det")
    det_pos = strings.add("DET")
    adp_label = strings.add("ADP")
    conj_dep = strings.add("conj")
    cconj_pos = strings.add("CCONJ")
    nominal_pos = (NOUN, PROPN, PRON)

    prev_end = -1
    for word in doclike:
        if word.pos not in nominal_pos:
            continue
        # Prevent nested chunks: skip heads whose subtree starts inside the
        # chunk that was emitted last.
        if word.left_edge.i <= prev_end:
            continue

        if word.dep in np_deps:
            # By default the chunk stops at the head; only the first right
            # child can move that boundary.
            right_end = word
            first_right = next(iter(word.rights), None)
            if first_right:
                if first_right.dep == adj_label:
                    # Adjective to the right: take its whole subtree so that
                    # chains of adjectives stay inside the chunk.
                    right_end = first_right.right_edge
                elif first_right.dep == det_label and first_right.pos == det_pos:
                    # Determiner to the right: include it and cut there.
                    right_end = first_right
                elif first_right.dep in np_modifs:
                    # flat / fixed / compound: expand to the head's right edge.
                    right_end = word.right_edge
            prev_end = right_end.i

            # Leave a leading adposition out of the chunk.
            start = word.left_edge.i
            if word.left_edge.pos == adp_label:
                start += 1
            yield start, right_end.i + 1, np_label
            continue

        if word.dep != conj_dep:
            continue

        # Walk leftwards up the chain of conjuncts to the first conjunct.
        head = word.head
        while head.dep == conj_dep and head.head.i < head.i:
            head = head.head
        # If the head is an NP, and we're coordinated to it, we're an NP.
        if head.dep in np_deps:
            prev_end = word.i
            # Leave a left-attached coordinating conjunction out of the chunk.
            start = word.left_edge.i
            if word.left_edge.pos == cconj_pos:
                start += 1
            yield start, word.i + 1, np_label
# Syntax iterators exported by this module, keyed by iterator name.
SYNTAX_ITERATORS = dict(noun_chunks=noun_chunks)

View File

@ -247,6 +247,11 @@ def pt_tokenizer():
return get_lang_class("pt")().tokenizer return get_lang_class("pt")().tokenizer
@pytest.fixture(scope="session")
def pt_vocab():
    """Return the vocab of a freshly instantiated "pt" language class."""
    portuguese_cls = get_lang_class("pt")
    nlp = portuguese_cls()
    return nlp.vocab
@pytest.fixture(scope="session") @pytest.fixture(scope="session")
def ro_tokenizer(): def ro_tokenizer():
return get_lang_class("ro")().tokenizer return get_lang_class("ro")().tokenizer

View File

@ -0,0 +1,221 @@
from spacy.tokens import Doc
import pytest
# fmt: off
# One entry per case: (words, heads, deps, pos, expected (start, end) chunks).
PT_NOUN_CHUNK_CASES = [
    # determiner + noun
    # um cachorro -> um cachorro
    (
        ["um", "cachorro"],
        [1, 1],
        ["det", "ROOT"],
        ["DET", "NOUN"],
        [(0, 2)],
    ),
    # two determiners before the noun
    # meu o pai -> meu o pai
    (
        ["meu", "o", "pai"],
        [2, 2, 2],
        ["det", "det", "ROOT"],
        ["DET", "DET", "NOUN"],
        [(0, 3)],
    ),
    # two determiners before the noun
    # todos essos caros -> todos essos caros
    (
        ["todos", "essos", "caros"],
        [2, 2, 2],
        ["det", "det", "ROOT"],
        ["DET", "DET", "NOUN"],
        [(0, 3)],
    ),
    # two determiners, one of them after the noun
    # um irmão meu -> um irmão meu
    (
        ["um", "irmão", "meu"],
        [1, 1, 1],
        ["det", "ROOT", "det"],
        ["DET", "NOUN", "DET"],
        [(0, 3)],
    ),
    # two determiners before the noun
    # o meu pai -> o meu pai
    (
        ["o", "meu", "pai"],
        [2, 2, 2],
        ["det", "det", "ROOT"],
        ["DET", "DET", "NOUN"],
        [(0, 3)],
    ),
    # post-nominal "det" tagged PRON is left out of the chunk
    # A bicicleta essa está estragada -> A bicicleta
    (
        ["A", "bicicleta", "essa", "está", "estragada"],
        [1, 4, 1, 4, 4],
        ["det", "nsubj", "det", "cop", "ROOT"],
        ["DET", "NOUN", "PRON", "AUX", "ADJ"],
        [(0, 2)],
    ),
    # relative subclause
    # o computador que comprou -> o computador, que
    (
        ["o", "computador", "que", "comprou"],
        [1, 1, 3, 1],
        ["det", "ROOT", "nsubj", "acl:relcl"],
        ["DET", "NOUN", "PRON", "VERB"],
        [(0, 2), (2, 3)],
    ),
    # det + noun + adj
    # O cachorro marrom -> O cachorro marrom
    (
        ["O", "cachorro", "marrom"],
        [1, 1, 1],
        ["det", "ROOT", "amod"],
        ["DET", "NOUN", "ADJ"],
        [(0, 3)],
    ),
    # det + noun + adj, plural
    # As calças baratas -> As calças baratas
    (
        ["As", "calças", "baratas"],
        [1, 1, 1],
        ["det", "ROOT", "amod"],
        ["DET", "NOUN", "ADJ"],
        [(0, 3)],
    ),
    # det + adj + noun
    # uma boa ideia -> uma boa ideia
    (
        ["uma", "boa", "ideia"],
        [2, 2, 2],
        ["det", "amod", "ROOT"],
        ["DET", "ADJ", "NOUN"],
        [(0, 3)],
    ),
    # multiple coordinated adjectives
    # Uma garota esperta e inteligente -> Uma garota esperta e inteligente
    (
        ["Uma", "garota", "esperta", "e", "inteligente"],
        [1, 1, 1, 4, 2],
        ["det", "ROOT", "amod", "cc", "conj"],
        ["DET", "NOUN", "ADJ", "CCONJ", "ADJ"],
        [(0, 5)],
    ),
    # determiner, adjective, compound created by flat
    # a grande São Paolo -> a grande São Paolo
    (
        ["a", "grande", "São", "Paolo"],
        [2, 2, 2, 2],
        ["det", "amod", "ROOT", "flat:name"],
        ["DET", "ADJ", "PROPN", "PROPN"],
        [(0, 4)],
    ),
    # one determiner + one noun + one adjective qualified by an adverb
    # alguns fazendeiros muito ricos -> alguns fazendeiros muito ricos
    (
        ["alguns", "fazendeiros", "muito", "ricos"],
        [1, 1, 3, 1],
        ["det", "ROOT", "advmod", "amod"],
        ["DET", "NOUN", "ADV", "ADJ"],
        [(0, 4)],
    ),
    # two coordinated NPs
    # Eu tenho um cachorro e um gato -> Eu, um cachorro, um gato
    (
        ["Eu", "tenho", "um", "cachorro", "e", "um", "gato"],
        [1, 1, 3, 1, 6, 6, 3],
        ["nsubj", "ROOT", "det", "obj", "cc", "det", "conj"],
        ["PRON", "VERB", "DET", "NOUN", "CCONJ", "DET", "NOUN"],
        [(0, 1), (2, 4), (5, 7)],
    ),
    # two adjacent NPs (apposition)
    # o escritor brasileiro Aníbal Machado -> o escritor brasileiro, Aníbal Machado
    (
        ["o", "escritor", "brasileiro", "Aníbal", "Machado"],
        [1, 1, 1, 1, 3],
        ["det", "ROOT", "amod", "appos", "flat:name"],
        ["DET", "NOUN", "ADJ", "PROPN", "PROPN"],
        [(0, 3), (3, 5)],
    ),
    # noun compound, person name and titles
    # Dom Pedro II -> Dom Pedro II
    (
        ["Dom", "Pedro", "II"],
        [0, 0, 0],
        ["ROOT", "flat:name", "flat:name"],
        ["PROPN", "PROPN", "PROPN"],
        [(0, 3)],
    ),
    # noun compound created by flat
    # os Estados Unidos -> os Estados Unidos
    (
        ["os", "Estados", "Unidos"],
        [1, 1, 1],
        ["det", "ROOT", "flat:name"],
        ["DET", "PROPN", "PROPN"],
        [(0, 3)],
    ),
    # nmod relation between NPs
    # a destruição da cidade -> a destruição, cidade
    (
        ["a", "destruição", "da", "cidade"],
        [1, 1, 3, 1],
        ["det", "ROOT", "case", "nmod"],
        ["DET", "NOUN", "ADP", "NOUN"],
        [(0, 2), (3, 4)],
    ),
    # compounding by nmod, several NPs chained together
    # a primeira fábrica de medicamentos do governo -> a primeira fábrica, medicamentos, governo
    (
        ["a", "primeira", "fábrica", "de", "medicamentos", "do", "governo"],
        [2, 2, 2, 4, 2, 6, 2],
        ["det", "amod", "ROOT", "case", "nmod", "case", "nmod"],
        ["DET", "ADJ", "NOUN", "ADP", "NOUN", "ADP", "NOUN"],
        [(0, 3), (4, 5), (6, 7)],
    ),
    # several NPs
    # Tradução da reportagem de Susana -> Tradução, reportagem, Susana
    (
        ["Tradução", "da", "reportagem", "de", "Susana"],
        [0, 2, 0, 4, 2],
        ["ROOT", "case", "nmod", "case", "nmod"],
        ["NOUN", "ADP", "NOUN", "ADP", "PROPN"],
        [(0, 1), (2, 3), (4, 5)],
    ),
    # several NPs, the last one coordinated
    # O gato gordo da Susana e seu amigo -> O gato gordo, Susana, seu amigo
    (
        ["O", "gato", "gordo", "da", "Susana", "e", "seu", "amigo"],
        [1, 1, 1, 4, 1, 7, 7, 1],
        ["det", "ROOT", "amod", "case", "nmod", "cc", "det", "conj"],
        ["DET", "NOUN", "ADJ", "ADP", "PROPN", "CCONJ", "DET", "NOUN"],
        [(0, 3), (4, 5), (6, 8)],
    ),
    # passive subject and passive agent
    # Os novos gastos são alimentados pela grande conta bancária de Clinton -> Os novos gastos, grande conta bancária, Clinton
    (
        ["Os", "novos", "gastos", "são", "alimentados", "pela", "grande", "conta", "bancária", "de", "Clinton"],
        [2, 2, 4, 4, 4, 7, 7, 4, 7, 10, 7],
        ["det", "amod", "nsubj:pass", "aux:pass", "ROOT", "case", "amod", "obl:agent", "amod", "case", "nmod"],
        ["DET", "ADJ", "NOUN", "AUX", "VERB", "ADP", "ADJ", "NOUN", "ADJ", "ADP", "PROPN"],
        [(0, 3), (6, 9), (10, 11)],
    ),
]
# fmt: on


@pytest.mark.parametrize(
    "words,heads,deps,pos,chunk_offsets",
    PT_NOUN_CHUNK_CASES,
)
def test_pt_noun_chunks(pt_vocab, words, heads, deps, pos, chunk_offsets):
    """Check the (start, end) offsets of the noun chunks of a hand-annotated Doc."""
    annotations = {"words": words, "heads": heads, "deps": deps, "pos": pos}
    parsed = Doc(pt_vocab, **annotations)
    found = [(chunk.start, chunk.end) for chunk in parsed.noun_chunks]
    assert found == chunk_offsets
def test_noun_chunks_is_parsed_pt(pt_tokenizer):
    """Iterating noun_chunks on a tokenized-only 'pt' Doc must raise ValueError."""
    unparsed = pt_tokenizer("en Oxford este verano")
    with pytest.raises(ValueError):
        # noun_chunks is lazy, so it has to be consumed for the error to surface.
        tuple(unparsed.noun_chunks)