Return Tuple[Span] for all Doc/Span attrs that provide spans (#12288)

* Return Tuple[Span] for all Doc/Span attrs that provide spans

* Update Span types
This commit is contained in:
Adriane Boyd 2023-03-01 16:00:02 +01:00 committed by GitHub
parent df4c069a13
commit da75896ef5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 58 additions and 55 deletions

View File

@@ -131,9 +131,9 @@ class Doc:
default: str = ..., default: str = ...,
) -> None: ... ) -> None: ...
@property @property
def noun_chunks(self) -> Iterator[Span]: ... def noun_chunks(self) -> Tuple[Span]: ...
@property @property
def sents(self) -> Iterator[Span]: ... def sents(self) -> Tuple[Span]: ...
@property @property
def lang(self) -> int: ... def lang(self) -> int: ...
@property @property

View File

@@ -703,10 +703,10 @@ cdef class Doc:
return self.text return self.text
property ents: property ents:
"""The named entities in the document. Returns a tuple of named entity """The named entities in the document. Returns a list of named entity
`Span` objects, if the entity recognizer has been applied. `Span` objects, if the entity recognizer has been applied.
RETURNS (tuple): Entities in the document, one `Span` per entity. RETURNS (Tuple[Span]): Entities in the document, one `Span` per entity.
DOCS: https://spacy.io/api/doc#ents DOCS: https://spacy.io/api/doc#ents
""" """
@@ -864,7 +864,7 @@ cdef class Doc:
NP-level coordination, no prepositional phrases, and no relative NP-level coordination, no prepositional phrases, and no relative
clauses. clauses.
YIELDS (Span): Noun chunks in the document. RETURNS (Tuple[Span]): Noun chunks in the document.
DOCS: https://spacy.io/api/doc#noun_chunks DOCS: https://spacy.io/api/doc#noun_chunks
""" """
@@ -873,36 +873,35 @@ cdef class Doc:
# Accumulate the result before beginning to iterate over it. This # Accumulate the result before beginning to iterate over it. This
# prevents the tokenization from being changed out from under us # prevents the tokenization from being changed out from under us
# during the iteration. The tricky thing here is that Span accepts # during the iteration.
# its tokenization changing, so it's okay once we have the Span
# objects. See Issue #375.
spans = [] spans = []
for start, end, label in self.noun_chunks_iterator(self): for start, end, label in self.noun_chunks_iterator(self):
spans.append(Span(self, start, end, label=label)) spans.append(Span(self, start, end, label=label))
for span in spans: return tuple(spans)
yield span
@property @property
def sents(self): def sents(self):
"""Iterate over the sentences in the document. Yields sentence `Span` """Iterate over the sentences in the document. Yields sentence `Span`
objects. Sentence spans have no label. objects. Sentence spans have no label.
YIELDS (Span): Sentences in the document. RETURNS (Tuple[Span]): Sentences in the document.
DOCS: https://spacy.io/api/doc#sents DOCS: https://spacy.io/api/doc#sents
""" """
if not self.has_annotation("SENT_START"): if not self.has_annotation("SENT_START"):
raise ValueError(Errors.E030) raise ValueError(Errors.E030)
if "sents" in self.user_hooks: if "sents" in self.user_hooks:
yield from self.user_hooks["sents"](self) return tuple(self.user_hooks["sents"](self))
else: else:
start = 0 start = 0
spans = []
for i in range(1, self.length): for i in range(1, self.length):
if self.c[i].sent_start == 1: if self.c[i].sent_start == 1:
yield Span(self, start, i) spans.append(Span(self, start, i))
start = i start = i
if start != self.length: if start != self.length:
yield Span(self, start, self.length) spans.append(Span(self, start, self.length))
return tuple(spans)
@property @property
def lang(self): def lang(self):

View File

@@ -74,6 +74,8 @@ class Span:
@property @property
def ents(self) -> Tuple[Span]: ... def ents(self) -> Tuple[Span]: ...
@property @property
def sents(self) -> Tuple[Span]: ...
@property
def has_vector(self) -> bool: ... def has_vector(self) -> bool: ...
@property @property
def vector(self) -> Floats1d: ... def vector(self) -> Floats1d: ...
@@ -86,7 +88,7 @@ class Span:
@property @property
def text_with_ws(self) -> str: ... def text_with_ws(self) -> str: ...
@property @property
def noun_chunks(self) -> Iterator[Span]: ... def noun_chunks(self) -> Tuple[Span]: ...
@property @property
def root(self) -> Token: ... def root(self) -> Token: ...
def char_span( def char_span(

View File

@@ -461,20 +461,21 @@ cdef class Span:
"""Obtain the sentences that contain this span. If the given span """Obtain the sentences that contain this span. If the given span
crosses sentence boundaries, return all sentences it is a part of. crosses sentence boundaries, return all sentences it is a part of.
RETURNS (Iterable[Span]): All sentences that the span is a part of. RETURNS (Tuple[Span]): All sentences that the span is a part of.
DOCS: https://spacy.io/api/span#sents DOCS: https://spacy.io/api/span#sents
""" """
cdef int start cdef int start
cdef int i cdef int i
if "sents" in self.doc.user_span_hooks: if "sents" in self.doc.user_span_hooks:
yield from self.doc.user_span_hooks["sents"](self) return tuple(self.doc.user_span_hooks["sents"](self))
elif "sents" in self.doc.user_hooks: spans = []
if "sents" in self.doc.user_hooks:
for sentence in self.doc.user_hooks["sents"](self.doc): for sentence in self.doc.user_hooks["sents"](self.doc):
if sentence.end > self.start: if sentence.end > self.start:
if sentence.start < self.end or sentence.start == self.start == self.end: if sentence.start < self.end or sentence.start == self.start == self.end:
yield sentence spans.append(sentence)
else: else:
break break
else: else:
@@ -489,12 +490,13 @@ cdef class Span:
# Now, find all the sentences in the span # Now, find all the sentences in the span
for i in range(start + 1, self.doc.length): for i in range(start + 1, self.doc.length):
if self.doc.c[i].sent_start == 1: if self.doc.c[i].sent_start == 1:
yield Span(self.doc, start, i) spans.append(Span(self.doc, start, i))
start = i start = i
if start >= self.end: if start >= self.end:
break break
if start < self.end: if start < self.end:
yield Span(self.doc, start, self.end) spans.append(Span(self.doc, start, self.end))
return tuple(spans)
@property @property
@@ -502,7 +504,7 @@ cdef class Span:
"""The named entities that fall completely within the span. Returns """The named entities that fall completely within the span. Returns
a tuple of `Span` objects. a tuple of `Span` objects.
RETURNS (tuple): Entities in the span, one `Span` per entity. RETURNS (Tuple[Span]): Entities in the span, one `Span` per entity.
DOCS: https://spacy.io/api/span#ents DOCS: https://spacy.io/api/span#ents
""" """
@@ -517,7 +519,7 @@ cdef class Span:
ents.append(ent) ents.append(ent)
else: else:
break break
return ents return tuple(ents)
@property @property
def has_vector(self): def has_vector(self):
@@ -613,13 +615,15 @@ cdef class Span:
NP-level coordination, no prepositional phrases, and no relative NP-level coordination, no prepositional phrases, and no relative
clauses. clauses.
YIELDS (Span): Noun chunks in the span. RETURNS (Tuple[Span]): Noun chunks in the span.
DOCS: https://spacy.io/api/span#noun_chunks DOCS: https://spacy.io/api/span#noun_chunks
""" """
spans = []
for span in self.doc.noun_chunks: for span in self.doc.noun_chunks:
if span.start >= self.start and span.end <= self.end: if span.start >= self.start and span.end <= self.end:
yield span spans.append(span)
return tuple(spans)
@property @property
def root(self): def root(self):

View File

@@ -654,11 +654,10 @@ the [`TextCategorizer`](/api/textcategorizer).
## Doc.noun_chunks {id="noun_chunks",tag="property",model="parser"} ## Doc.noun_chunks {id="noun_chunks",tag="property",model="parser"}
Iterate over the base noun phrases in the document. Yields base noun-phrase Returns a tuple of the base noun phrases in the doc, if the document has been
`Span` objects, if the document has been syntactically parsed. A base noun syntactically parsed. A base noun phrase, or "NP chunk", is a noun phrase that
phrase, or "NP chunk", is a noun phrase that does not permit other NPs to be does not permit other NPs to be nested within it so no NP-level coordination,
nested within it so no NP-level coordination, no prepositional phrases, and no no prepositional phrases, and no relative clauses.
relative clauses.
To customize the noun chunk iterator in a loaded pipeline, modify To customize the noun chunk iterator in a loaded pipeline, modify
[`nlp.vocab.get_noun_chunks`](/api/vocab#attributes). If the `noun_chunk` [`nlp.vocab.get_noun_chunks`](/api/vocab#attributes). If the `noun_chunk`
@@ -675,13 +674,13 @@ implemented for the given language, a `NotImplementedError` is raised.
> assert chunks[1].text == "another phrase" > assert chunks[1].text == "another phrase"
> ``` > ```
| Name | Description | | Name | Description |
| ---------- | ------------------------------------- | | ----------- | -------------------------------------------- |
| **YIELDS** | Noun chunks in the document. ~~Span~~ | | **RETURNS** | Noun chunks in the document. ~~Tuple[Span]~~ |
## Doc.sents {id="sents",tag="property",model="sentences"} ## Doc.sents {id="sents",tag="property",model="sentences"}
Iterate over the sentences in the document. Sentence spans have no label. Returns a tuple of the sentences in the document. Sentence spans have no label.
This property is only available when This property is only available when
[sentence boundaries](/usage/linguistic-features#sbd) have been set on the [sentence boundaries](/usage/linguistic-features#sbd) have been set on the
@@ -697,9 +696,9 @@ will raise an error otherwise.
> assert [s.root.text for s in sents] == ["is", "'s"] > assert [s.root.text for s in sents] == ["is", "'s"]
> ``` > ```
| Name | Description | | Name | Description |
| ---------- | ----------------------------------- | | ----------- | ------------------------------------------ |
| **YIELDS** | Sentences in the document. ~~Span~~ | | **RETURNS** | Sentences in the document. ~~Tuple[Span]~~ |
## Doc.has_vector {id="has_vector",tag="property",model="vectors"} ## Doc.has_vector {id="has_vector",tag="property",model="vectors"}

View File

@@ -275,17 +275,16 @@ The named entities that fall completely within the span. Returns a tuple of
> assert ents[0].text == "Mr. Best" > assert ents[0].text == "Mr. Best"
> ``` > ```
| Name | Description | | Name | Description |
| ----------- | ----------------------------------------------------------------- | | ----------- | ------------------------------------------------------------ |
| **RETURNS** | Entities in the span, one `Span` per entity. ~~Tuple[Span, ...]~~ | | **RETURNS** | Entities in the span, one `Span` per entity. ~~Tuple[Span]~~ |
## Span.noun_chunks {id="noun_chunks",tag="property",model="parser"} ## Span.noun_chunks {id="noun_chunks",tag="property",model="parser"}
Iterate over the base noun phrases in the span. Yields base noun-phrase `Span` Returns a tuple of the base noun phrases in the span if the document has been
objects, if the document has been syntactically parsed. A base noun phrase, or syntactically parsed. A base noun phrase, or "NP chunk", is a noun phrase that
"NP chunk", is a noun phrase that does not permit other NPs to be nested within does not permit other NPs to be nested within it so no NP-level coordination,
it so no NP-level coordination, no prepositional phrases, and no relative no prepositional phrases, and no relative clauses.
clauses.
If the `noun_chunk` [syntax iterator](/usage/linguistic-features#language-data) If the `noun_chunk` [syntax iterator](/usage/linguistic-features#language-data)
has not been implemented for the given language, a `NotImplementedError` is has not been implemented for the given language, a `NotImplementedError` is
@@ -301,9 +300,9 @@ raised.
> assert chunks[0].text == "another phrase" > assert chunks[0].text == "another phrase"
> ``` > ```
| Name | Description | | Name | Description |
| ---------- | --------------------------------- | | ----------- | ---------------------------------------- |
| **YIELDS** | Noun chunks in the span. ~~Span~~ | | **RETURNS** | Noun chunks in the span. ~~Tuple[Span]~~ |
## Span.as_doc {id="as_doc",tag="method"} ## Span.as_doc {id="as_doc",tag="method"}
@@ -525,9 +524,9 @@ sent = doc[sent.start : max(sent.end, span.end)]
## Span.sents {id="sents",tag="property",model="sentences",version="3.2.1"} ## Span.sents {id="sents",tag="property",model="sentences",version="3.2.1"}
Returns a generator over the sentences the span belongs to. This property is Returns a tuple of the sentences the span belongs to. This property is only
only available when [sentence boundaries](/usage/linguistic-features#sbd) have available when [sentence boundaries](/usage/linguistic-features#sbd) have been
been set on the document by the `parser`, `senter`, `sentencizer` or some custom set on the document by the `parser`, `senter`, `sentencizer` or some custom
function. It will raise an error otherwise. function. It will raise an error otherwise.
If the span happens to cross sentence boundaries, all sentences the span If the span happens to cross sentence boundaries, all sentences the span
@@ -541,9 +540,9 @@ overlaps with will be returned.
> assert len(span.sents) == 2 > assert len(span.sents) == 2
> ``` > ```
| Name | Description | | Name | Description |
| ----------- | -------------------------------------------------------------------------- | | ----------- | ------------------------------------------------------------- |
| **RETURNS** | A generator yielding sentences this `Span` is a part of ~~Iterable[Span]~~ | | **RETURNS** | A tuple of sentences this `Span` is a part of ~~Tuple[Span]~~ |
## Attributes {id="attributes"} ## Attributes {id="attributes"}