raise NotImplementedError when noun_chunks iterator is not implemented (#6711)

* raise NotImplementedError when noun_chunks iterator is not implemented

* bring back, fix and document span.noun_chunks

* formatting

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
This commit is contained in:
Sofie Van Landeghem 2021-01-17 12:56:05 +01:00 committed by GitHub
parent bf0cdae8d4
commit fed8f48965
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 93 additions and 52 deletions

View File

@ -463,6 +463,8 @@ class Errors:
"issue tracker: http://github.com/explosion/spaCy/issues") "issue tracker: http://github.com/explosion/spaCy/issues")
# TODO: fix numbering after merging develop into master # TODO: fix numbering after merging develop into master
E894 = ("The 'noun_chunks' syntax iterator is not implemented for language "
"'{lang}'.")
E895 = ("The 'textcat' component received gold-standard annotations with " E895 = ("The 'textcat' component received gold-standard annotations with "
"multiple labels per document. In spaCy 3 you should use the " "multiple labels per document. In spaCy 3 you should use the "
"'textcat_multilabel' component for this instead. " "'textcat_multilabel' component for this instead. "

View File

@ -86,7 +86,7 @@ def like_num(text):
if text in _num_words: if text in _num_words:
return True return True
# CHeck ordinal number # Check ordinal number
if text in _ordinal_words: if text in _ordinal_words:
return True return True
return False return False

View File

@ -2,6 +2,8 @@ import pytest
import numpy import numpy
import logging import logging
import mock import mock
from spacy.lang.xx import MultiLanguage
from spacy.tokens import Doc, Span from spacy.tokens import Doc, Span
from spacy.vocab import Vocab from spacy.vocab import Vocab
from spacy.lexeme import Lexeme from spacy.lexeme import Lexeme
@ -633,6 +635,14 @@ def test_doc_set_ents_invalid_spans(en_tokenizer):
doc.ents = spans doc.ents = spans
def test_doc_noun_chunks_not_implemented():
"""Test that a language without noun_chunk iterator, throws a NotImplementedError"""
text = "Může data vytvářet a spravovat, ale především je dokáže analyzovat, najít v nich nové vztahy a vše přehledně vizualizovat."
nlp = MultiLanguage()
doc = nlp(text)
with pytest.raises(NotImplementedError):
chunks = list(doc.noun_chunks)
def test_span_groups(en_tokenizer): def test_span_groups(en_tokenizer):
doc = en_tokenizer("Some text about Colombia and the Czech Republic") doc = en_tokenizer("Some text about Colombia and the Czech Republic")
doc.spans["hi"] = [Span(doc, 3, 4, label="bye")] doc.spans["hi"] = [Span(doc, 3, 4, label="bye")]

View File

@ -1,11 +1,16 @@
import numpy
from spacy.attrs import HEAD, DEP
from spacy.symbols import nsubj, dobj, amod, nmod, conj, cc, root
from spacy.lang.en.syntax_iterators import noun_chunks
from spacy.tokens import Doc from spacy.tokens import Doc
import pytest import pytest
@pytest.fixture
def doc(en_vocab):
words = ["Peter", "has", "chronic", "command", "and", "control", "issues"]
heads = [1, 1, 6, 6, 3, 3, 1]
deps = ["nsubj", "ROOT", "amod", "nmod", "cc", "conj", "dobj"]
pos = ["PROPN", "VERB", "ADJ", "NOUN", "CCONJ", "NOUN", "NOUN"]
return Doc(en_vocab, words=words, heads=heads, deps=deps, pos=pos)
def test_noun_chunks_is_parsed(en_tokenizer): def test_noun_chunks_is_parsed(en_tokenizer):
"""Test that noun_chunks raises Value Error for 'en' language if Doc is not parsed.""" """Test that noun_chunks raises Value Error for 'en' language if Doc is not parsed."""
doc = en_tokenizer("This is a sentence") doc = en_tokenizer("This is a sentence")
@ -13,31 +18,27 @@ def test_noun_chunks_is_parsed(en_tokenizer):
list(doc.noun_chunks) list(doc.noun_chunks)
def test_en_noun_chunks_not_nested(en_vocab): def test_en_noun_chunks_not_nested(doc, en_vocab):
words = ["Peter", "has", "chronic", "command", "and", "control", "issues"] """Test that each token only appears in one noun chunk at most"""
heads = [1, 1, 6, 6, 3, 3, 1]
deps = ["nsubj", "ROOT", "amod", "nmod", "cc", "conj", "dobj"]
doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
doc.from_array(
[HEAD, DEP],
numpy.asarray(
[
[1, nsubj],
[0, root],
[4, amod],
[3, nmod],
[-1, cc],
[-2, conj],
[-5, dobj],
],
dtype="uint64",
),
)
doc.noun_chunks_iterator = noun_chunks
word_occurred = {} word_occurred = {}
for chunk in doc.noun_chunks: chunks = list(doc.noun_chunks)
assert len(chunks) > 1
for chunk in chunks:
for word in chunk: for word in chunk:
word_occurred.setdefault(word.text, 0) word_occurred.setdefault(word.text, 0)
word_occurred[word.text] += 1 word_occurred[word.text] += 1
assert len(word_occurred) > 0
for word, freq in word_occurred.items(): for word, freq in word_occurred.items():
assert freq == 1, (word, [chunk.text for chunk in doc.noun_chunks]) assert freq == 1, (word, [chunk.text for chunk in doc.noun_chunks])
def test_noun_chunks_span(doc, en_tokenizer):
"""Test that the span.noun_chunks property works correctly"""
doc_chunks = list(doc.noun_chunks)
span = doc[0:3]
span_chunks = list(span.noun_chunks)
assert 0 < len(span_chunks) < len(doc_chunks)
for chunk in span_chunks:
assert chunk in doc_chunks
assert chunk.start >= 0
assert chunk.end <= 3

View File

@ -81,7 +81,8 @@ def test_issue3199():
""" """
words = ["This", "is", "a", "sentence"] words = ["This", "is", "a", "sentence"]
doc = Doc(Vocab(), words=words, heads=[0] * len(words), deps=["dep"] * len(words)) doc = Doc(Vocab(), words=words, heads=[0] * len(words), deps=["dep"] * len(words))
assert list(doc[0:3].noun_chunks) == [] with pytest.raises(NotImplementedError):
list(doc[0:3].noun_chunks)
def test_issue3209(): def test_issue3209():

View File

@ -816,8 +816,10 @@ cdef class Doc:
@property @property
def noun_chunks(self): def noun_chunks(self):
"""Iterate over the base noun phrases in the document. Yields base """Iterate over the base noun phrases in the document. Yields base
noun-phrase #[code Span] objects, if the document has been noun-phrase #[code Span] objects, if the language has a noun chunk iterator.
syntactically parsed. A base noun phrase, or "NP chunk", is a noun Raises a NotImplementedError otherwise.
A base noun phrase, or "NP chunk", is a noun
phrase that does not permit other NPs to be nested within it – so no phrase that does not permit other NPs to be nested within it – so no
NP-level coordination, no prepositional phrases, and no relative NP-level coordination, no prepositional phrases, and no relative
clauses. clauses.
@ -826,14 +828,15 @@ cdef class Doc:
DOCS: https://nightly.spacy.io/api/doc#noun_chunks DOCS: https://nightly.spacy.io/api/doc#noun_chunks
""" """
if self.noun_chunks_iterator is None:
raise NotImplementedError(Errors.E894.format(lang=self.vocab.lang))
# Accumulate the result before beginning to iterate over it. This # Accumulate the result before beginning to iterate over it. This
# prevents the tokenisation from being changed out from under us # prevents the tokenization from being changed out from under us
# during the iteration. The tricky thing here is that Span accepts # during the iteration. The tricky thing here is that Span accepts
# its tokenisation changing, so it's okay once we have the Span # its tokenization changing, so it's okay once we have the Span
# objects. See Issue #375. # objects. See Issue #375.
spans = [] spans = []
if self.noun_chunks_iterator is not None:
for start, end, label in self.noun_chunks_iterator(self): for start, end, label in self.noun_chunks_iterator(self):
spans.append(Span(self, start, end, label=label)) spans.append(Span(self, start, end, label=label))
for span in spans: for span in spans:

View File

@ -487,29 +487,24 @@ cdef class Span:
""" """
return "".join([t.text_with_ws for t in self]) return "".join([t.text_with_ws for t in self])
@property @property
def noun_chunks(self): def noun_chunks(self):
"""Yields base noun-phrase `Span` objects, if the document has been """Iterate over the base noun phrases in the span. Yields base
syntactically parsed. A base noun phrase, or "NP chunk", is a noun noun-phrase #[code Span] objects, if the language has a noun chunk iterator.
Raises a NotImplementedError otherwise.
A base noun phrase, or "NP chunk", is a noun
phrase that does not permit other NPs to be nested within it – so no phrase that does not permit other NPs to be nested within it – so no
NP-level coordination, no prepositional phrases, and no relative NP-level coordination, no prepositional phrases, and no relative
clauses. clauses.
YIELDS (Span): Base noun-phrase `Span` objects. YIELDS (Span): Noun chunks in the span.
DOCS: https://nightly.spacy.io/api/span#noun_chunks DOCS: https://nightly.spacy.io/api/span#noun_chunks
""" """
# Accumulate the result before beginning to iterate over it. This for span in self.doc.noun_chunks:
# prevents the tokenisation from being changed out from under us if span.start >= self.start and span.end <= self.end:
# during the iteration. The tricky thing here is that Span accepts
# its tokenisation changing, so it's okay once we have the Span
# objects. See Issue #375
spans = []
cdef attr_t label
if self.doc.noun_chunks_iterator is not None:
for start, end, label in self.doc.noun_chunks_iterator(self):
spans.append(Span(self.doc, start, end, label=label))
for span in spans:
yield span yield span
@property @property

View File

@ -616,11 +616,15 @@ phrase, or "NP chunk", is a noun phrase that does not permit other NPs to be
nested within it – so no NP-level coordination, no prepositional phrases, and no nested within it – so no NP-level coordination, no prepositional phrases, and no
relative clauses. relative clauses.
If the `noun_chunk` [syntax iterator](/usage/adding-languages#language-data) has
not been implemented for the given language, a `NotImplementedError` is raised.
> #### Example > #### Example
> >
> ```python > ```python
> doc = nlp("A phrase with another phrase occurs.") > doc = nlp("A phrase with another phrase occurs.")
> chunks = list(doc.noun_chunks) > chunks = list(doc.noun_chunks)
> assert len(chunks) == 2
> assert chunks[0].text == "A phrase" > assert chunks[0].text == "A phrase"
> assert chunks[1].text == "another phrase" > assert chunks[1].text == "another phrase"
> ``` > ```

View File

@ -274,6 +274,31 @@ if the entity recognizer has been applied.
| ----------- | ----------------------------------------------------------------- | | ----------- | ----------------------------------------------------------------- |
| **RETURNS** | Entities in the span, one `Span` per entity. ~~Tuple[Span, ...]~~ | | **RETURNS** | Entities in the span, one `Span` per entity. ~~Tuple[Span, ...]~~ |
## Span.noun_chunks {#noun_chunks tag="property" model="parser"}
Iterate over the base noun phrases in the span. Yields base noun-phrase `Span`
objects, if the document has been syntactically parsed. A base noun phrase, or
"NP chunk", is a noun phrase that does not permit other NPs to be nested within
it – so no NP-level coordination, no prepositional phrases, and no relative
clauses.
If the `noun_chunk` [syntax iterator](/usage/adding-languages#language-data) has
not been implemented for the given language, a `NotImplementedError` is raised.
> #### Example
>
> ```python
> doc = nlp("A phrase with another phrase occurs.")
> span = doc[3:5]
> chunks = list(span.noun_chunks)
> assert len(chunks) == 1
> assert chunks[0].text == "another phrase"
> ```
| Name | Description |
| ---------- | --------------------------------- |
| **YIELDS** | Noun chunks in the span. ~~Span~~ |
## Span.as_doc {#as_doc tag="method"} ## Span.as_doc {#as_doc tag="method"}
Create a new `Doc` object corresponding to the `Span`, with a copy of the data. Create a new `Doc` object corresponding to the `Span`, with a copy of the data.

View File

@ -221,7 +221,7 @@ Noun chunks are "base noun phrases" – flat phrases that have a noun as their
head. You can think of noun chunks as a noun plus the words describing the noun head. You can think of noun chunks as a noun plus the words describing the noun
– for example, "the lavish green grass" or "the world's largest tech fund". To – for example, "the lavish green grass" or "the world's largest tech fund". To
get the noun chunks in a document, simply iterate over get the noun chunks in a document, simply iterate over
[`Doc.noun_chunks`](/api/doc#noun_chunks) [`Doc.noun_chunks`](/api/doc#noun_chunks).
```python ```python
### {executable="true"} ### {executable="true"}