diff --git a/spacy/errors.py b/spacy/errors.py index 8cbcbe6d9..f3cdbcee7 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -463,6 +463,8 @@ class Errors: "issue tracker: http://github.com/explosion/spaCy/issues") # TODO: fix numbering after merging develop into master + E894 = ("The 'noun_chunks' syntax iterator is not implemented for language " + "'{lang}'.") E895 = ("The 'textcat' component received gold-standard annotations with " "multiple labels per document. In spaCy 3 you should use the " "'textcat_multilabel' component for this instead. " diff --git a/spacy/lang/he/lex_attrs.py b/spacy/lang/he/lex_attrs.py index 2953e7592..2cd6f630e 100644 --- a/spacy/lang/he/lex_attrs.py +++ b/spacy/lang/he/lex_attrs.py @@ -86,7 +86,7 @@ def like_num(text): if text in _num_words: return True - # CHeck ordinal number + # Check ordinal number if text in _ordinal_words: return True return False diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index 74b8d825e..ea95ca772 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -2,6 +2,8 @@ import pytest import numpy import logging import mock + +from spacy.lang.xx import MultiLanguage from spacy.tokens import Doc, Span from spacy.vocab import Vocab from spacy.lexeme import Lexeme @@ -633,6 +635,14 @@ def test_doc_set_ents_invalid_spans(en_tokenizer): doc.ents = spans +def test_doc_noun_chunks_not_implemented(): + """Test that a language without noun_chunk iterator, throws a NotImplementedError""" + text = "Může data vytvářet a spravovat, ale především je dokáže analyzovat, najít v nich nové vztahy a vše přehledně vizualizovat." 
+ nlp = MultiLanguage() + doc = nlp(text) + with pytest.raises(NotImplementedError): + chunks = list(doc.noun_chunks) + def test_span_groups(en_tokenizer): doc = en_tokenizer("Some text about Colombia and the Czech Republic") doc.spans["hi"] = [Span(doc, 3, 4, label="bye")] diff --git a/spacy/tests/lang/en/test_noun_chunks.py b/spacy/tests/lang/en/test_noun_chunks.py index 540f3ed84..0c54ffbb4 100644 --- a/spacy/tests/lang/en/test_noun_chunks.py +++ b/spacy/tests/lang/en/test_noun_chunks.py @@ -1,11 +1,16 @@ -import numpy -from spacy.attrs import HEAD, DEP -from spacy.symbols import nsubj, dobj, amod, nmod, conj, cc, root -from spacy.lang.en.syntax_iterators import noun_chunks from spacy.tokens import Doc import pytest +@pytest.fixture +def doc(en_vocab): + words = ["Peter", "has", "chronic", "command", "and", "control", "issues"] + heads = [1, 1, 6, 6, 3, 3, 1] + deps = ["nsubj", "ROOT", "amod", "nmod", "cc", "conj", "dobj"] + pos = ["PROPN", "VERB", "ADJ", "NOUN", "CCONJ", "NOUN", "NOUN"] + return Doc(en_vocab, words=words, heads=heads, deps=deps, pos=pos) + + def test_noun_chunks_is_parsed(en_tokenizer): """Test that noun_chunks raises Value Error for 'en' language if Doc is not parsed.""" doc = en_tokenizer("This is a sentence") @@ -13,31 +18,27 @@ def test_noun_chunks_is_parsed(en_tokenizer): list(doc.noun_chunks) -def test_en_noun_chunks_not_nested(en_vocab): - words = ["Peter", "has", "chronic", "command", "and", "control", "issues"] - heads = [1, 1, 6, 6, 3, 3, 1] - deps = ["nsubj", "ROOT", "amod", "nmod", "cc", "conj", "dobj"] - doc = Doc(en_vocab, words=words, heads=heads, deps=deps) - doc.from_array( - [HEAD, DEP], - numpy.asarray( - [ - [1, nsubj], - [0, root], - [4, amod], - [3, nmod], - [-1, cc], - [-2, conj], - [-5, dobj], - ], - dtype="uint64", - ), - ) - doc.noun_chunks_iterator = noun_chunks +def test_en_noun_chunks_not_nested(doc, en_vocab): + """Test that each token only appears in one noun chunk at most""" word_occurred = {} - for chunk in 
doc.noun_chunks: + chunks = list(doc.noun_chunks) + assert len(chunks) > 1 + for chunk in chunks: for word in chunk: word_occurred.setdefault(word.text, 0) word_occurred[word.text] += 1 + assert len(word_occurred) > 0 for word, freq in word_occurred.items(): assert freq == 1, (word, [chunk.text for chunk in doc.noun_chunks]) + + +def test_noun_chunks_span(doc, en_tokenizer): + """Test that the span.noun_chunks property works correctly""" + doc_chunks = list(doc.noun_chunks) + span = doc[0:3] + span_chunks = list(span.noun_chunks) + assert 0 < len(span_chunks) < len(doc_chunks) + for chunk in span_chunks: + assert chunk in doc_chunks + assert chunk.start >= 0 + assert chunk.end <= 3 diff --git a/spacy/tests/regression/test_issue3001-3500.py b/spacy/tests/regression/test_issue3001-3500.py index 01f58ae77..362ba67ae 100644 --- a/spacy/tests/regression/test_issue3001-3500.py +++ b/spacy/tests/regression/test_issue3001-3500.py @@ -81,7 +81,8 @@ def test_issue3199(): """ words = ["This", "is", "a", "sentence"] doc = Doc(Vocab(), words=words, heads=[0] * len(words), deps=["dep"] * len(words)) - assert list(doc[0:3].noun_chunks) == [] + with pytest.raises(NotImplementedError): + list(doc[0:3].noun_chunks) def test_issue3209(): diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 32a81f515..32f8c91fa 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -816,8 +816,10 @@ cdef class Doc: @property def noun_chunks(self): """Iterate over the base noun phrases in the document. Yields base - noun-phrase #[code Span] objects, if the document has been - syntactically parsed. A base noun phrase, or "NP chunk", is a noun + noun-phrase #[code Span] objects, if the language has a noun chunk iterator. + Raises a NotImplementedError otherwise. + + A base noun phrase, or "NP chunk", is a noun phrase that does not permit other NPs to be nested within it – so no NP-level coordination, no prepositional phrases, and no relative clauses. 
@@ -826,16 +828,17 @@ cdef class Doc: DOCS: https://nightly.spacy.io/api/doc#noun_chunks """ + if self.noun_chunks_iterator is None: + raise NotImplementedError(Errors.E894.format(lang=self.vocab.lang)) # Accumulate the result before beginning to iterate over it. This - # prevents the tokenisation from being changed out from under us + # prevents the tokenization from being changed out from under us # during the iteration. The tricky thing here is that Span accepts - # its tokenisation changing, so it's okay once we have the Span + # its tokenization changing, so it's okay once we have the Span # objects. See Issue #375. spans = [] - if self.noun_chunks_iterator is not None: - for start, end, label in self.noun_chunks_iterator(self): - spans.append(Span(self, start, end, label=label)) + for start, end, label in self.noun_chunks_iterator(self): + spans.append(Span(self, start, end, label=label)) for span in spans: yield span diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 8643816a1..4e6fb84f5 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -487,30 +487,25 @@ cdef class Span: """ return "".join([t.text_with_ws for t in self]) + @property def noun_chunks(self): - """Yields base noun-phrase `Span` objects, if the document has been - syntactically parsed. A base noun phrase, or "NP chunk", is a noun + """Iterate over the base noun phrases in the span. Yields base + noun-phrase #[code Span] objects, if the language has a noun chunk iterator. + Raises a NotImplementedError otherwise. + + A base noun phrase, or "NP chunk", is a noun phrase that does not permit other NPs to be nested within it – so no NP-level coordination, no prepositional phrases, and no relative clauses. - YIELDS (Span): Base noun-phrase `Span` objects. + YIELDS (Span): Noun chunks in the span. DOCS: https://nightly.spacy.io/api/span#noun_chunks """ - # Accumulate the result before beginning to iterate over it. 
This - # prevents the tokenisation from being changed out from under us - # during the iteration. The tricky thing here is that Span accepts - # its tokenisation changing, so it's okay once we have the Span - # objects. See Issue #375 - spans = [] - cdef attr_t label - if self.doc.noun_chunks_iterator is not None: - for start, end, label in self.doc.noun_chunks_iterator(self): - spans.append(Span(self.doc, start, end, label=label)) - for span in spans: - yield span + for span in self.doc.noun_chunks: + if span.start >= self.start and span.end <= self.end: + yield span @property def root(self): diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md index f3521dae3..a0b4c29bb 100644 --- a/website/docs/api/doc.md +++ b/website/docs/api/doc.md @@ -616,11 +616,15 @@ phrase, or "NP chunk", is a noun phrase that does not permit other NPs to be nested within it – so no NP-level coordination, no prepositional phrases, and no relative clauses. +If the `noun_chunk` [syntax iterator](/usage/adding-languages#language-data) has +not been implemented for the given language, a `NotImplementedError` is raised. + > #### Example > > ```python > doc = nlp("A phrase with another phrase occurs.") > chunks = list(doc.noun_chunks) +> assert len(chunks) == 2 > assert chunks[0].text == "A phrase" > assert chunks[1].text == "another phrase" > ``` diff --git a/website/docs/api/span.md b/website/docs/api/span.md index 7fa1aaa38..37d18c62e 100644 --- a/website/docs/api/span.md +++ b/website/docs/api/span.md @@ -187,7 +187,7 @@ the character indices don't map to a valid span. | Name | Description | | ------------------------------------ | ----------------------------------------------------------------------------------------- | | `start` | The index of the first character of the span. ~~int~~ | -| `end` | The index of the last character after the span. ~~int~~ | +| `end` | The index of the last character after the span. ~~int~~ | | `label` | A label to attach to the span, e.g. 
for named entities. ~~Union[int, str]~~ | | `kb_id` 2.2 | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ | | `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ | @@ -274,6 +274,31 @@ if the entity recognizer has been applied. | ----------- | ----------------------------------------------------------------- | | **RETURNS** | Entities in the span, one `Span` per entity. ~~Tuple[Span, ...]~~ | +## Span.noun_chunks {#noun_chunks tag="property" model="parser"} + +Iterate over the base noun phrases in the span. Yields base noun-phrase `Span` +objects, if the document has been syntactically parsed. A base noun phrase, or +"NP chunk", is a noun phrase that does not permit other NPs to be nested within +it – so no NP-level coordination, no prepositional phrases, and no relative +clauses. + +If the `noun_chunk` [syntax iterator](/usage/adding-languages#language-data) has +not been implemented for the given language, a `NotImplementedError` is raised. + +> #### Example +> +> ```python +> doc = nlp("A phrase with another phrase occurs.") +> span = doc[3:5] +> chunks = list(span.noun_chunks) +> assert len(chunks) == 1 +> assert chunks[0].text == "another phrase" +> ``` + +| Name | Description | | ---------- | --------------------------------- | | **YIELDS** | Noun chunks in the span. ~~Span~~ | + ## Span.as_doc {#as_doc tag="method"} Create a new `Doc` object corresponding to the `Span`, with a copy of the data. diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md index 9a2b538d1..80a8eab1b 100644 --- a/website/docs/usage/linguistic-features.md +++ b/website/docs/usage/linguistic-features.md @@ -221,7 +221,7 @@ Noun chunks are "base noun phrases" – flat phrases that have a noun as their head. You can think of noun chunks as a noun plus the words describing the noun – for example, "the lavish green grass" or "the world’s largest tech fund". 
To get the noun chunks in a document, simply iterate over -[`Doc.noun_chunks`](/api/doc#noun_chunks) +[`Doc.noun_chunks`](/api/doc#noun_chunks). ```python ### {executable="true"}