Return Tuple[Span] for all Doc/Span attrs that provide spans (#12288)

* Return Tuple[Span] for all Doc/Span attrs that provide spans

* Update Span types
This commit is contained in:
Adriane Boyd 2023-03-01 16:00:02 +01:00 committed by GitHub
parent df4c069a13
commit da75896ef5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 58 additions and 55 deletions

View File

@ -131,9 +131,9 @@ class Doc:
default: str = ...,
) -> None: ...
@property
def noun_chunks(self) -> Iterator[Span]: ...
def noun_chunks(self) -> Tuple[Span]: ...
@property
def sents(self) -> Iterator[Span]: ...
def sents(self) -> Tuple[Span]: ...
@property
def lang(self) -> int: ...
@property

View File

@ -703,10 +703,10 @@ cdef class Doc:
return self.text
property ents:
"""The named entities in the document. Returns a tuple of named entity
"""The named entities in the document. Returns a list of named entity
`Span` objects, if the entity recognizer has been applied.
RETURNS (tuple): Entities in the document, one `Span` per entity.
RETURNS (Tuple[Span]): Entities in the document, one `Span` per entity.
DOCS: https://spacy.io/api/doc#ents
"""
@ -864,7 +864,7 @@ cdef class Doc:
NP-level coordination, no prepositional phrases, and no relative
clauses.
YIELDS (Span): Noun chunks in the document.
RETURNS (Tuple[Span]): Noun chunks in the document.
DOCS: https://spacy.io/api/doc#noun_chunks
"""
@ -873,36 +873,35 @@ cdef class Doc:
# Accumulate the result before beginning to iterate over it. This
# prevents the tokenization from being changed out from under us
# during the iteration. The tricky thing here is that Span accepts
# its tokenization changing, so it's okay once we have the Span
# objects. See Issue #375.
# during the iteration.
spans = []
for start, end, label in self.noun_chunks_iterator(self):
spans.append(Span(self, start, end, label=label))
for span in spans:
yield span
return tuple(spans)
@property
def sents(self):
"""Iterate over the sentences in the document. Yields sentence `Span`
objects. Sentence spans have no label.
YIELDS (Span): Sentences in the document.
RETURNS (Tuple[Span]): Sentences in the document.
DOCS: https://spacy.io/api/doc#sents
"""
if not self.has_annotation("SENT_START"):
raise ValueError(Errors.E030)
if "sents" in self.user_hooks:
yield from self.user_hooks["sents"](self)
return tuple(self.user_hooks["sents"](self))
else:
start = 0
spans = []
for i in range(1, self.length):
if self.c[i].sent_start == 1:
yield Span(self, start, i)
spans.append(Span(self, start, i))
start = i
if start != self.length:
yield Span(self, start, self.length)
spans.append(Span(self, start, self.length))
return tuple(spans)
@property
def lang(self):

View File

@ -74,6 +74,8 @@ class Span:
@property
def ents(self) -> Tuple[Span]: ...
@property
def sents(self) -> Tuple[Span]: ...
@property
def has_vector(self) -> bool: ...
@property
def vector(self) -> Floats1d: ...
@ -86,7 +88,7 @@ class Span:
@property
def text_with_ws(self) -> str: ...
@property
def noun_chunks(self) -> Iterator[Span]: ...
def noun_chunks(self) -> Tuple[Span]: ...
@property
def root(self) -> Token: ...
def char_span(

View File

@ -461,20 +461,21 @@ cdef class Span:
"""Obtain the sentences that contain this span. If the given span
crosses sentence boundaries, return all sentences it is a part of.
RETURNS (Iterable[Span]): All sentences that the span is a part of.
RETURNS (Tuple[Span]): All sentences that the span is a part of.
DOCS: https://spacy.io/api/span#sents
DOCS: https://spacy.io/api/span#sents
"""
cdef int start
cdef int i
if "sents" in self.doc.user_span_hooks:
yield from self.doc.user_span_hooks["sents"](self)
elif "sents" in self.doc.user_hooks:
return tuple(self.doc.user_span_hooks["sents"](self))
spans = []
if "sents" in self.doc.user_hooks:
for sentence in self.doc.user_hooks["sents"](self.doc):
if sentence.end > self.start:
if sentence.start < self.end or sentence.start == self.start == self.end:
yield sentence
spans.append(sentence)
else:
break
else:
@ -489,12 +490,13 @@ cdef class Span:
# Now, find all the sentences in the span
for i in range(start + 1, self.doc.length):
if self.doc.c[i].sent_start == 1:
yield Span(self.doc, start, i)
spans.append(Span(self.doc, start, i))
start = i
if start >= self.end:
break
if start < self.end:
yield Span(self.doc, start, self.end)
spans.append(Span(self.doc, start, self.end))
return tuple(spans)
@property
@ -502,7 +504,7 @@ cdef class Span:
"""The named entities that fall completely within the span. Returns
a tuple of `Span` objects.
RETURNS (tuple): Entities in the span, one `Span` per entity.
RETURNS (Tuple[Span]): Entities in the span, one `Span` per entity.
DOCS: https://spacy.io/api/span#ents
"""
@ -517,7 +519,7 @@ cdef class Span:
ents.append(ent)
else:
break
return ents
return tuple(ents)
@property
def has_vector(self):
@ -613,13 +615,15 @@ cdef class Span:
NP-level coordination, no prepositional phrases, and no relative
clauses.
YIELDS (Span): Noun chunks in the span.
RETURNS (Tuple[Span]): Noun chunks in the span.
DOCS: https://spacy.io/api/span#noun_chunks
"""
spans = []
for span in self.doc.noun_chunks:
if span.start >= self.start and span.end <= self.end:
yield span
spans.append(span)
return tuple(spans)
@property
def root(self):

View File

@ -654,11 +654,10 @@ the [`TextCategorizer`](/api/textcategorizer).
## Doc.noun_chunks {id="noun_chunks",tag="property",model="parser"}
Iterate over the base noun phrases in the document. Yields base noun-phrase
`Span` objects, if the document has been syntactically parsed. A base noun
phrase, or "NP chunk", is a noun phrase that does not permit other NPs to be
nested within it so no NP-level coordination, no prepositional phrases, and no
relative clauses.
Returns a tuple of the base noun phrases in the doc, if the document has been
syntactically parsed. A base noun phrase, or "NP chunk", is a noun phrase that
does not permit other NPs to be nested within it – so no NP-level coordination,
no prepositional phrases, and no relative clauses.
To customize the noun chunk iterator in a loaded pipeline, modify
[`nlp.vocab.get_noun_chunks`](/api/vocab#attributes). If the `noun_chunk`
@ -675,13 +674,13 @@ implemented for the given language, a `NotImplementedError` is raised.
> assert chunks[1].text == "another phrase"
> ```
| Name | Description |
| ---------- | ------------------------------------- |
| **YIELDS** | Noun chunks in the document. ~~Span~~ |
| Name | Description |
| ----------- | -------------------------------------------- |
| **RETURNS** | Noun chunks in the document. ~~Tuple[Span]~~ |
## Doc.sents {id="sents",tag="property",model="sentences"}
Iterate over the sentences in the document. Sentence spans have no label.
Returns a tuple of the sentences in the document. Sentence spans have no label.
This property is only available when
[sentence boundaries](/usage/linguistic-features#sbd) have been set on the
@ -697,9 +696,9 @@ will raise an error otherwise.
> assert [s.root.text for s in sents] == ["is", "'s"]
> ```
| Name | Description |
| ---------- | ----------------------------------- |
| **YIELDS** | Sentences in the document. ~~Span~~ |
| Name | Description |
| ----------- | ------------------------------------------ |
| **RETURNS** | Sentences in the document. ~~Tuple[Span]~~ |
## Doc.has_vector {id="has_vector",tag="property",model="vectors"}

View File

@ -275,17 +275,16 @@ The named entities that fall completely within the span. Returns a tuple of
> assert ents[0].text == "Mr. Best"
> ```
| Name | Description |
| ----------- | ----------------------------------------------------------------- |
| **RETURNS** | Entities in the span, one `Span` per entity. ~~Tuple[Span, ...]~~ |
| Name | Description |
| ----------- | ------------------------------------------------------------ |
| **RETURNS** | Entities in the span, one `Span` per entity. ~~Tuple[Span]~~ |
## Span.noun_chunks {id="noun_chunks",tag="property",model="parser"}
Iterate over the base noun phrases in the span. Yields base noun-phrase `Span`
objects, if the document has been syntactically parsed. A base noun phrase, or
"NP chunk", is a noun phrase that does not permit other NPs to be nested within
it so no NP-level coordination, no prepositional phrases, and no relative
clauses.
Returns a tuple of the base noun phrases in the span if the document has been
syntactically parsed. A base noun phrase, or "NP chunk", is a noun phrase that
does not permit other NPs to be nested within it – so no NP-level coordination,
no prepositional phrases, and no relative clauses.
If the `noun_chunk` [syntax iterator](/usage/linguistic-features#language-data)
has not been implemented for the given language, a `NotImplementedError` is
@ -301,9 +300,9 @@ raised.
> assert chunks[0].text == "another phrase"
> ```
| Name | Description |
| ---------- | --------------------------------- |
| **YIELDS** | Noun chunks in the span. ~~Span~~ |
| Name | Description |
| ----------- | ---------------------------------------- |
| **RETURNS** | Noun chunks in the span. ~~Tuple[Span]~~ |
## Span.as_doc {id="as_doc",tag="method"}
@ -525,9 +524,9 @@ sent = doc[sent.start : max(sent.end, span.end)]
## Span.sents {id="sents",tag="property",model="sentences",version="3.2.1"}
Returns a generator over the sentences the span belongs to. This property is
only available when [sentence boundaries](/usage/linguistic-features#sbd) have
been set on the document by the `parser`, `senter`, `sentencizer` or some custom
Returns a tuple of the sentences the span belongs to. This property is only
available when [sentence boundaries](/usage/linguistic-features#sbd) have been
set on the document by the `parser`, `senter`, `sentencizer` or some custom
function. It will raise an error otherwise.
If the span happens to cross sentence boundaries, all sentences the span
@ -541,9 +540,9 @@ overlaps with will be returned.
> assert len(span.sents) == 2
> ```
| Name | Description |
| ----------- | -------------------------------------------------------------------------- |
| **RETURNS** | A generator yielding sentences this `Span` is a part of ~~Iterable[Span]~~ |
| Name | Description |
| ----------- | ------------------------------------------------------------- |
| **RETURNS** | A tuple of sentences this `Span` is a part of ~~Tuple[Span]~~ |
## Attributes {id="attributes"}