mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 17:24:41 +03:00
raise NotImplementedError when noun_chunks iterator is not implemented (#6711)
* raise NotImplementedError when noun_chunks iterator is not implemented
* bring back, fix and document span.noun_chunks
* formatting

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
This commit is contained in:
parent
bf0cdae8d4
commit
fed8f48965
|
@ -463,6 +463,8 @@ class Errors:
|
||||||
"issue tracker: http://github.com/explosion/spaCy/issues")
|
"issue tracker: http://github.com/explosion/spaCy/issues")
|
||||||
|
|
||||||
# TODO: fix numbering after merging develop into master
|
# TODO: fix numbering after merging develop into master
|
||||||
|
E894 = ("The 'noun_chunks' syntax iterator is not implemented for language "
|
||||||
|
"'{lang}'.")
|
||||||
E895 = ("The 'textcat' component received gold-standard annotations with "
|
E895 = ("The 'textcat' component received gold-standard annotations with "
|
||||||
"multiple labels per document. In spaCy 3 you should use the "
|
"multiple labels per document. In spaCy 3 you should use the "
|
||||||
"'textcat_multilabel' component for this instead. "
|
"'textcat_multilabel' component for this instead. "
|
||||||
|
|
|
@ -86,7 +86,7 @@ def like_num(text):
|
||||||
if text in _num_words:
|
if text in _num_words:
|
||||||
return True
|
return True
|
||||||
|
|
||||||
# CHeck ordinal number
|
# Check ordinal number
|
||||||
if text in _ordinal_words:
|
if text in _ordinal_words:
|
||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
|
@ -2,6 +2,8 @@ import pytest
|
||||||
import numpy
|
import numpy
|
||||||
import logging
|
import logging
|
||||||
import mock
|
import mock
|
||||||
|
|
||||||
|
from spacy.lang.xx import MultiLanguage
|
||||||
from spacy.tokens import Doc, Span
|
from spacy.tokens import Doc, Span
|
||||||
from spacy.vocab import Vocab
|
from spacy.vocab import Vocab
|
||||||
from spacy.lexeme import Lexeme
|
from spacy.lexeme import Lexeme
|
||||||
|
@ -633,6 +635,14 @@ def test_doc_set_ents_invalid_spans(en_tokenizer):
|
||||||
doc.ents = spans
|
doc.ents = spans
|
||||||
|
|
||||||
|
|
||||||
|
def test_doc_noun_chunks_not_implemented():
|
||||||
|
"""Test that a language without noun_chunk iterator, throws a NotImplementedError"""
|
||||||
|
text = "Může data vytvářet a spravovat, ale především je dokáže analyzovat, najít v nich nové vztahy a vše přehledně vizualizovat."
|
||||||
|
nlp = MultiLanguage()
|
||||||
|
doc = nlp(text)
|
||||||
|
with pytest.raises(NotImplementedError):
|
||||||
|
chunks = list(doc.noun_chunks)
|
||||||
|
|
||||||
def test_span_groups(en_tokenizer):
|
def test_span_groups(en_tokenizer):
|
||||||
doc = en_tokenizer("Some text about Colombia and the Czech Republic")
|
doc = en_tokenizer("Some text about Colombia and the Czech Republic")
|
||||||
doc.spans["hi"] = [Span(doc, 3, 4, label="bye")]
|
doc.spans["hi"] = [Span(doc, 3, 4, label="bye")]
|
||||||
|
|
|
@ -1,11 +1,16 @@
|
||||||
import numpy
|
|
||||||
from spacy.attrs import HEAD, DEP
|
|
||||||
from spacy.symbols import nsubj, dobj, amod, nmod, conj, cc, root
|
|
||||||
from spacy.lang.en.syntax_iterators import noun_chunks
|
|
||||||
from spacy.tokens import Doc
|
from spacy.tokens import Doc
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def doc(en_vocab):
|
||||||
|
words = ["Peter", "has", "chronic", "command", "and", "control", "issues"]
|
||||||
|
heads = [1, 1, 6, 6, 3, 3, 1]
|
||||||
|
deps = ["nsubj", "ROOT", "amod", "nmod", "cc", "conj", "dobj"]
|
||||||
|
pos = ["PROPN", "VERB", "ADJ", "NOUN", "CCONJ", "NOUN", "NOUN"]
|
||||||
|
return Doc(en_vocab, words=words, heads=heads, deps=deps, pos=pos)
|
||||||
|
|
||||||
|
|
||||||
def test_noun_chunks_is_parsed(en_tokenizer):
|
def test_noun_chunks_is_parsed(en_tokenizer):
|
||||||
"""Test that noun_chunks raises Value Error for 'en' language if Doc is not parsed."""
|
"""Test that noun_chunks raises Value Error for 'en' language if Doc is not parsed."""
|
||||||
doc = en_tokenizer("This is a sentence")
|
doc = en_tokenizer("This is a sentence")
|
||||||
|
@ -13,31 +18,27 @@ def test_noun_chunks_is_parsed(en_tokenizer):
|
||||||
list(doc.noun_chunks)
|
list(doc.noun_chunks)
|
||||||
|
|
||||||
|
|
||||||
def test_en_noun_chunks_not_nested(en_vocab):
|
def test_en_noun_chunks_not_nested(doc, en_vocab):
|
||||||
words = ["Peter", "has", "chronic", "command", "and", "control", "issues"]
|
"""Test that each token only appears in one noun chunk at most"""
|
||||||
heads = [1, 1, 6, 6, 3, 3, 1]
|
|
||||||
deps = ["nsubj", "ROOT", "amod", "nmod", "cc", "conj", "dobj"]
|
|
||||||
doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
|
|
||||||
doc.from_array(
|
|
||||||
[HEAD, DEP],
|
|
||||||
numpy.asarray(
|
|
||||||
[
|
|
||||||
[1, nsubj],
|
|
||||||
[0, root],
|
|
||||||
[4, amod],
|
|
||||||
[3, nmod],
|
|
||||||
[-1, cc],
|
|
||||||
[-2, conj],
|
|
||||||
[-5, dobj],
|
|
||||||
],
|
|
||||||
dtype="uint64",
|
|
||||||
),
|
|
||||||
)
|
|
||||||
doc.noun_chunks_iterator = noun_chunks
|
|
||||||
word_occurred = {}
|
word_occurred = {}
|
||||||
for chunk in doc.noun_chunks:
|
chunks = list(doc.noun_chunks)
|
||||||
|
assert len(chunks) > 1
|
||||||
|
for chunk in chunks:
|
||||||
for word in chunk:
|
for word in chunk:
|
||||||
word_occurred.setdefault(word.text, 0)
|
word_occurred.setdefault(word.text, 0)
|
||||||
word_occurred[word.text] += 1
|
word_occurred[word.text] += 1
|
||||||
|
assert len(word_occurred) > 0
|
||||||
for word, freq in word_occurred.items():
|
for word, freq in word_occurred.items():
|
||||||
assert freq == 1, (word, [chunk.text for chunk in doc.noun_chunks])
|
assert freq == 1, (word, [chunk.text for chunk in doc.noun_chunks])
|
||||||
|
|
||||||
|
|
||||||
|
def test_noun_chunks_span(doc, en_tokenizer):
|
||||||
|
"""Test that the span.noun_chunks property works correctly"""
|
||||||
|
doc_chunks = list(doc.noun_chunks)
|
||||||
|
span = doc[0:3]
|
||||||
|
span_chunks = list(span.noun_chunks)
|
||||||
|
assert 0 < len(span_chunks) < len(doc_chunks)
|
||||||
|
for chunk in span_chunks:
|
||||||
|
assert chunk in doc_chunks
|
||||||
|
assert chunk.start >= 0
|
||||||
|
assert chunk.end <= 3
|
||||||
|
|
|
@ -81,7 +81,8 @@ def test_issue3199():
|
||||||
"""
|
"""
|
||||||
words = ["This", "is", "a", "sentence"]
|
words = ["This", "is", "a", "sentence"]
|
||||||
doc = Doc(Vocab(), words=words, heads=[0] * len(words), deps=["dep"] * len(words))
|
doc = Doc(Vocab(), words=words, heads=[0] * len(words), deps=["dep"] * len(words))
|
||||||
assert list(doc[0:3].noun_chunks) == []
|
with pytest.raises(NotImplementedError):
|
||||||
|
list(doc[0:3].noun_chunks)
|
||||||
|
|
||||||
|
|
||||||
def test_issue3209():
|
def test_issue3209():
|
||||||
|
|
|
@ -816,8 +816,10 @@ cdef class Doc:
|
||||||
@property
|
@property
|
||||||
def noun_chunks(self):
|
def noun_chunks(self):
|
||||||
"""Iterate over the base noun phrases in the document. Yields base
|
"""Iterate over the base noun phrases in the document. Yields base
|
||||||
noun-phrase #[code Span] objects, if the document has been
|
noun-phrase #[code Span] objects, if the language has a noun chunk iterator.
|
||||||
syntactically parsed. A base noun phrase, or "NP chunk", is a noun
|
Raises a NotImplementedError otherwise.
|
||||||
|
|
||||||
|
A base noun phrase, or "NP chunk", is a noun
|
||||||
phrase that does not permit other NPs to be nested within it – so no
|
phrase that does not permit other NPs to be nested within it – so no
|
||||||
NP-level coordination, no prepositional phrases, and no relative
|
NP-level coordination, no prepositional phrases, and no relative
|
||||||
clauses.
|
clauses.
|
||||||
|
@ -826,14 +828,15 @@ cdef class Doc:
|
||||||
|
|
||||||
DOCS: https://nightly.spacy.io/api/doc#noun_chunks
|
DOCS: https://nightly.spacy.io/api/doc#noun_chunks
|
||||||
"""
|
"""
|
||||||
|
if self.noun_chunks_iterator is None:
|
||||||
|
raise NotImplementedError(Errors.E894.format(lang=self.vocab.lang))
|
||||||
|
|
||||||
# Accumulate the result before beginning to iterate over it. This
|
# Accumulate the result before beginning to iterate over it. This
|
||||||
# prevents the tokenisation from being changed out from under us
|
# prevents the tokenization from being changed out from under us
|
||||||
# during the iteration. The tricky thing here is that Span accepts
|
# during the iteration. The tricky thing here is that Span accepts
|
||||||
# its tokenisation changing, so it's okay once we have the Span
|
# its tokenization changing, so it's okay once we have the Span
|
||||||
# objects. See Issue #375.
|
# objects. See Issue #375.
|
||||||
spans = []
|
spans = []
|
||||||
if self.noun_chunks_iterator is not None:
|
|
||||||
for start, end, label in self.noun_chunks_iterator(self):
|
for start, end, label in self.noun_chunks_iterator(self):
|
||||||
spans.append(Span(self, start, end, label=label))
|
spans.append(Span(self, start, end, label=label))
|
||||||
for span in spans:
|
for span in spans:
|
||||||
|
|
|
@ -487,29 +487,24 @@ cdef class Span:
|
||||||
"""
|
"""
|
||||||
return "".join([t.text_with_ws for t in self])
|
return "".join([t.text_with_ws for t in self])
|
||||||
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def noun_chunks(self):
|
def noun_chunks(self):
|
||||||
"""Yields base noun-phrase `Span` objects, if the document has been
|
"""Iterate over the base noun phrases in the span. Yields base
|
||||||
syntactically parsed. A base noun phrase, or "NP chunk", is a noun
|
noun-phrase #[code Span] objects, if the language has a noun chunk iterator.
|
||||||
|
Raises a NotImplementedError otherwise.
|
||||||
|
|
||||||
|
A base noun phrase, or "NP chunk", is a noun
|
||||||
phrase that does not permit other NPs to be nested within it – so no
|
phrase that does not permit other NPs to be nested within it – so no
|
||||||
NP-level coordination, no prepositional phrases, and no relative
|
NP-level coordination, no prepositional phrases, and no relative
|
||||||
clauses.
|
clauses.
|
||||||
|
|
||||||
YIELDS (Span): Base noun-phrase `Span` objects.
|
YIELDS (Span): Noun chunks in the span.
|
||||||
|
|
||||||
DOCS: https://nightly.spacy.io/api/span#noun_chunks
|
DOCS: https://nightly.spacy.io/api/span#noun_chunks
|
||||||
"""
|
"""
|
||||||
# Accumulate the result before beginning to iterate over it. This
|
for span in self.doc.noun_chunks:
|
||||||
# prevents the tokenisation from being changed out from under us
|
if span.start >= self.start and span.end <= self.end:
|
||||||
# during the iteration. The tricky thing here is that Span accepts
|
|
||||||
# its tokenisation changing, so it's okay once we have the Span
|
|
||||||
# objects. See Issue #375
|
|
||||||
spans = []
|
|
||||||
cdef attr_t label
|
|
||||||
if self.doc.noun_chunks_iterator is not None:
|
|
||||||
for start, end, label in self.doc.noun_chunks_iterator(self):
|
|
||||||
spans.append(Span(self.doc, start, end, label=label))
|
|
||||||
for span in spans:
|
|
||||||
yield span
|
yield span
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
|
|
@ -616,11 +616,15 @@ phrase, or "NP chunk", is a noun phrase that does not permit other NPs to be
|
||||||
nested within it – so no NP-level coordination, no prepositional phrases, and no
|
nested within it – so no NP-level coordination, no prepositional phrases, and no
|
||||||
relative clauses.
|
relative clauses.
|
||||||
|
|
||||||
|
If the `noun_chunks` [syntax iterator](/usage/adding-languages#language-data) has
|
||||||
|
not been implemented for the given language, a `NotImplementedError` is raised.
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> doc = nlp("A phrase with another phrase occurs.")
|
> doc = nlp("A phrase with another phrase occurs.")
|
||||||
> chunks = list(doc.noun_chunks)
|
> chunks = list(doc.noun_chunks)
|
||||||
|
> assert len(chunks) == 2
|
||||||
> assert chunks[0].text == "A phrase"
|
> assert chunks[0].text == "A phrase"
|
||||||
> assert chunks[1].text == "another phrase"
|
> assert chunks[1].text == "another phrase"
|
||||||
> ```
|
> ```
|
||||||
|
|
|
@ -274,6 +274,31 @@ if the entity recognizer has been applied.
|
||||||
| ----------- | ----------------------------------------------------------------- |
|
| ----------- | ----------------------------------------------------------------- |
|
||||||
| **RETURNS** | Entities in the span, one `Span` per entity. ~~Tuple[Span, ...]~~ |
|
| **RETURNS** | Entities in the span, one `Span` per entity. ~~Tuple[Span, ...]~~ |
|
||||||
|
|
||||||
|
## Span.noun_chunks {#noun_chunks tag="property" model="parser"}
|
||||||
|
|
||||||
|
Iterate over the base noun phrases in the span. Yields base noun-phrase `Span`
|
||||||
|
objects, if the document has been syntactically parsed. A base noun phrase, or
|
||||||
|
"NP chunk", is a noun phrase that does not permit other NPs to be nested within
|
||||||
|
it – so no NP-level coordination, no prepositional phrases, and no relative
|
||||||
|
clauses.
|
||||||
|
|
||||||
|
If the `noun_chunks` [syntax iterator](/usage/adding-languages#language-data) has
|
||||||
|
not been implemented for the given language, a `NotImplementedError` is raised.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> doc = nlp("A phrase with another phrase occurs.")
|
||||||
|
> span = doc[3:5]
|
||||||
|
> chunks = list(span.noun_chunks)
|
||||||
|
> assert len(chunks) == 1
|
||||||
|
> assert chunks[0].text == "another phrase"
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| ---------- | --------------------------------- |
|
||||||
|
| **YIELDS** | Noun chunks in the span. ~~Span~~ |
|
||||||
|
|
||||||
## Span.as_doc {#as_doc tag="method"}
|
## Span.as_doc {#as_doc tag="method"}
|
||||||
|
|
||||||
Create a new `Doc` object corresponding to the `Span`, with a copy of the data.
|
Create a new `Doc` object corresponding to the `Span`, with a copy of the data.
|
||||||
|
|
|
@ -221,7 +221,7 @@ Noun chunks are "base noun phrases" – flat phrases that have a noun as their
|
||||||
head. You can think of noun chunks as a noun plus the words describing the noun
|
head. You can think of noun chunks as a noun plus the words describing the noun
|
||||||
– for example, "the lavish green grass" or "the world’s largest tech fund". To
|
– for example, "the lavish green grass" or "the world’s largest tech fund". To
|
||||||
get the noun chunks in a document, simply iterate over
|
get the noun chunks in a document, simply iterate over
|
||||||
[`Doc.noun_chunks`](/api/doc#noun_chunks)
|
[`Doc.noun_chunks`](/api/doc#noun_chunks).
|
||||||
|
|
||||||
```python
|
```python
|
||||||
### {executable="true"}
|
### {executable="true"}
|
||||||
|
|
Loading…
Reference in New Issue
Block a user