From acc950807e0b2c72c8ee9143a4e23558134092bc Mon Sep 17 00:00:00 2001
From: "Patrick J. Burns"
Date: Sun, 9 Apr 2023 07:06:08 -0400
Subject: [PATCH] Add noun chunking to la syntax iterators

---
 spacy/lang/la/__init__.py               |  2 +
 spacy/lang/la/syntax_iterators.py       | 85 +++++++++++++++++++++++++
 spacy/tests/lang/la/test_noun_chunks.py | 52 +++++++++++++++
 3 files changed, 139 insertions(+)
 create mode 100644 spacy/lang/la/syntax_iterators.py
 create mode 100644 spacy/tests/lang/la/test_noun_chunks.py

diff --git a/spacy/lang/la/__init__.py b/spacy/lang/la/__init__.py
index 15b87c5b9..37164c3f3 100644
--- a/spacy/lang/la/__init__.py
+++ b/spacy/lang/la/__init__.py
@@ -2,12 +2,14 @@ from ...language import Language, BaseDefaults
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
+from .syntax_iterators import SYNTAX_ITERATORS
 
 
 class LatinDefaults(BaseDefaults):
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     stop_words = STOP_WORDS
     lex_attr_getters = LEX_ATTRS
+    syntax_iterators = SYNTAX_ITERATORS
 
 
 class Latin(Language):
diff --git a/spacy/lang/la/syntax_iterators.py b/spacy/lang/la/syntax_iterators.py
new file mode 100644
index 000000000..8e36c051d
--- /dev/null
+++ b/spacy/lang/la/syntax_iterators.py
@@ -0,0 +1,85 @@
+# NB: Modified from da on suggestion from https://github.com/explosion/spaCy/issues/7457#issuecomment-800349751 [PJB]
+
+from typing import Union, Iterator, Tuple
+from ...tokens import Doc, Span
+from ...symbols import NOUN, PROPN, PRON, VERB, AUX
+from ...errors import Errors
+
+
+def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
+    def is_verb_token(tok):
+        return tok.pos in [VERB, AUX]
+
+    def get_left_bound(doc, root):
+        left_bound = root
+        for tok in reversed(list(root.lefts)):
+            if tok.dep in np_left_deps:
+                left_bound = tok
+        return left_bound
+
+    def get_right_bound(doc, root):
+        right_bound = root
+        for tok in root.rights:
+            if tok.dep in np_right_deps:
+                right = get_right_bound(doc, tok)
+                if list(
+                    filter(
+                        lambda t: is_verb_token(t) or t.dep in stop_deps,
+                        doc[root.i : right.i],
+                    )
+                ):
+                    break
+                else:
+                    right_bound = right
+        return right_bound
+
+    def get_bounds(doc, root):
+        return get_left_bound(doc, root), get_right_bound(doc, root)
+
+    doc = doclike.doc  # Ensure works on both Doc and Span.
+
+    if not doc.has_annotation("DEP"):
+        raise ValueError(Errors.E029)
+
+    if not len(doc):
+        return
+
+    left_labels = [
+        "det",
+        "fixed",
+        "nmod:poss",
+        "amod",
+        "flat",
+        "goeswith",
+        "nummod",
+        "appos",
+    ]
+    right_labels = [
+        "fixed",
+        "nmod:poss",
+        "amod",
+        "flat",
+        "goeswith",
+        "nummod",
+        "appos",
+        "nmod",
+        "det",
+    ]
+    stop_labels = ["punct"]
+
+    np_label = doc.vocab.strings.add("NP")
+    np_left_deps = [doc.vocab.strings.add(label) for label in left_labels]
+    np_right_deps = [doc.vocab.strings.add(label) for label in right_labels]
+    stop_deps = [doc.vocab.strings.add(label) for label in stop_labels]
+
+    prev_right = -1
+    for token in doclike:
+        if token.pos in [PROPN, NOUN, PRON]:
+            left, right = get_bounds(doc, token)
+            if left.i <= prev_right:
+                continue
+            yield left.i, right.i + 1, np_label
+            prev_right = right.i
+
+
+SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
diff --git a/spacy/tests/lang/la/test_noun_chunks.py b/spacy/tests/lang/la/test_noun_chunks.py
new file mode 100644
index 000000000..ba8f5658b
--- /dev/null
+++ b/spacy/tests/lang/la/test_noun_chunks.py
@@ -0,0 +1,52 @@
+import pytest
+from spacy.tokens import Doc
+
+
+def test_noun_chunks_is_parsed(la_tokenizer):
+    """Test that noun_chunks raises ValueError for 'la' language if Doc is not parsed.
+    The Doc below comes straight from the tokenizer, so it carries no
+    dependency annotation and iterating doc.noun_chunks must raise
+    rather than silently yield nothing.
+    """
+    doc = la_tokenizer("Haec est sententia.")
+    with pytest.raises(ValueError):
+        list(doc.noun_chunks)
+
+
+LA_NP_TEST_EXAMPLES = [
+    (
+        "Haec narrantur a poetis de Perseo.",
+        ["DET", "VERB", "ADP", "NOUN", "ADP", "PROPN", "PUNCT"],
+        ["nsubj:pass", "ROOT", "case", "obl", "case", "obl", "punct"],
+        [1, 0, -1, -1, -3, -1, -5],
+        ["poetis", "Perseo"],
+    ),
+    (
+        "Perseus autem in sinu matris dormiebat.",
+        ["NOUN", "ADV", "ADP", "NOUN", "NOUN", "VERB", "PUNCT"],
+        ["nsubj", "discourse", "case", "obl", "nmod", "ROOT", "punct"],
+        [5, 4, 3, -1, -1, 0, -1],
+        ["Perseus", "sinu matris"],
+    ),
+]
+
+
+@pytest.mark.parametrize(
+    "text,pos,deps,heads,expected_noun_chunks", LA_NP_TEST_EXAMPLES
+)
+def test_la_noun_chunks(la_tokenizer, text, pos, deps, heads, expected_noun_chunks):
+    tokens = la_tokenizer(text)
+
+    assert len(heads) == len(pos)
+    doc = Doc(
+        tokens.vocab,
+        words=[t.text for t in tokens],
+        heads=[head + i for i, head in enumerate(heads)],
+        deps=deps,
+        pos=pos,
+    )
+
+    noun_chunks = list(doc.noun_chunks)
+    assert len(noun_chunks) == len(expected_noun_chunks)
+    for i, np in enumerate(noun_chunks):
+        assert np.text == expected_noun_chunks[i]
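
Reviewer note, not part of the patch: a minimal sketch of how the new iterator behaves end to end once LatinDefaults registers SYNTAX_ITERATORS. It reuses the annotations from the second test case above; the blank "la" pipeline and the hand-built Doc are illustrative assumptions, not code introduced by this patch.

# Minimal sketch (assumption: spaCy with this patch applied; the sentence
# and its POS/dep/head values mirror the second LA_NP_TEST_EXAMPLES entry).
import spacy
from spacy.tokens import Doc

# Creating a blank "la" pipeline wires the vocab's noun_chunks getter
# to the iterator added in spacy/lang/la/syntax_iterators.py.
nlp = spacy.blank("la")

words = ["Perseus", "autem", "in", "sinu", "matris", "dormiebat", "."]
pos = ["NOUN", "ADV", "ADP", "NOUN", "NOUN", "VERB", "PUNCT"]
deps = ["nsubj", "discourse", "case", "obl", "nmod", "ROOT", "punct"]
heads = [5, 5, 5, 2, 3, 5, 5]  # absolute head indices; same tree as the test

doc = Doc(nlp.vocab, words=words, pos=pos, deps=deps, heads=heads)
print([np.text for np in doc.noun_chunks])
# Expected, per the test above: ['Perseus', 'sinu matris']

Because "matris" (nmod) sits inside the chunk headed by "sinu", the prev_right bookkeeping in the iterator suppresses it as a separate chunk, which is why only two spans come out.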