Add span_id to Span.char_span, update Doc/Span.char_span docs

`Span.char_span(id=)` should be removed in the future.
This commit is contained in:
Adriane Boyd 2023-01-27 11:55:03 +01:00
parent 774c10fa39
commit f97a1afc3e
7 changed files with 34 additions and 13 deletions

View File

@ -163,6 +163,18 @@ def test_char_span(doc, i_sent, i, j, text):
assert span.text == text
def test_char_span_attributes(doc):
label = "LABEL"
kb_id = "KB_ID"
span_id = "SPAN_ID"
span1 = doc.char_span(20, 45, label=label, kb_id=kb_id, span_id=span_id)
span2 = doc[1:].char_span(15, 40, label=label, kb_id=kb_id, span_id=span_id)
assert span1.text == span2.text
assert span1.label_ == span2.label_ == label
assert span1.kb_id_ == span2.kb_id_ == kb_id
assert span1.id_ == span2.id_ == span_id
def test_spans_sent_spans(doc):
sents = list(doc.sents)
assert sents[0].start == 0

View File

@ -108,6 +108,7 @@ class Doc:
kb_id: Union[int, str] = ...,
vector: Optional[Floats1d] = ...,
alignment_mode: str = ...,
span_id: Union[int, str] = ...,
) -> Span: ...
def similarity(self, other: Union[Doc, Span, Token, Lexeme]) -> float: ...
@property

View File

@ -539,6 +539,7 @@ cdef class Doc:
with token boundaries), "contract" (span of all tokens completely
within the character span), "expand" (span of all tokens at least
partially covered by the character span). Defaults to "strict".
span_id (uint64 or string): An identifier to associate with the span.
RETURNS (Span): The newly constructed object.
DOCS: https://spacy.io/api/doc#char_span

View File

@ -98,7 +98,9 @@ class Span:
label: Union[int, str] = ...,
kb_id: Union[int, str] = ...,
vector: Optional[Floats1d] = ...,
id: Union[int, str] = ...,
alignment_mode: str = ...,
span_id: Union[int, str] = ...,
) -> Span: ...
@property
def conjuncts(self) -> Tuple[Token]: ...

View File

@ -639,26 +639,28 @@ cdef class Span:
else:
return self.doc[root]
def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, id=0, alignment_mode="strict"):
def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, id=0, alignment_mode="strict", span_id=0):
"""Create a `Span` object from the slice `span.text[start : end]`.
start (int): The index of the first character of the span.
end (int): The index of the first character after the span.
label (uint64 or string): A label to attach to the Span, e.g. for
label (Union[int, str]): A label to attach to the Span, e.g. for
named entities.
kb_id (uint64 or string): An ID from a KB to capture the meaning of a named entity.
kb_id (Union[int, str]): An ID from a KB to capture the meaning of a named entity.
vector (ndarray[ndim=1, dtype='float32']): A meaning representation of
the span.
id (Union[int, str]): Unused.
alignment_mode (str): How character indices are aligned to token
boundaries. Options: "strict" (character indices must be aligned
with token boundaries), "contract" (span of all tokens completely
within the character span), "expand" (span of all tokens at least
partially covered by the character span). Defaults to "strict".
span_id (Union[int, str]): An identifier to associate with the span.
RETURNS (Span): The newly constructed object.
"""
start_idx += self.c.start_char
end_idx += self.c.start_char
return self.doc.char_span(start_idx, end_idx, label=label, kb_id=kb_id, vector=vector, alignment_mode=alignment_mode)
return self.doc.char_span(start_idx, end_idx, label=label, kb_id=kb_id, vector=vector, alignment_mode=alignment_mode, span_id=span_id)
@property
def conjuncts(self):

View File

@ -210,13 +210,14 @@ alignment mode `"strict".
> ```
| Name | Description |
| ---------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| ---------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `start` | The index of the first character of the span. ~~int~~ |
| `end` | The index of the last character after the span. ~~int~~ |
| `label` | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~ |
| `kb_id` | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ |
| `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ |
| `alignment_mode` | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ |
| `span_id` <Tag variant="new">3.3.1</Tag> | An identifier to associate with the span. ~~Union[int, str]~~ |
| **RETURNS** | The newly constructed object or `None`. ~~Optional[Span]~~ |
## Doc.set_ents {id="set_ents",tag="method",version="3"}

View File

@ -193,7 +193,9 @@ the character indices don't map to a valid span.
| `label` | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~ |
| `kb_id` | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ |
| `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ |
| `id` | Unused. ~~Union[int, str]~~ |
| `alignment_mode` <Tag variant="new">3.5.1</Tag> | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ |
| `span_id` <Tag variant="new">3.5.1</Tag> | An identifier to associate with the span. ~~Union[int, str]~~ |
| **RETURNS** | The newly constructed object or `None`. ~~Optional[Span]~~ |
## Span.similarity {id="similarity",tag="method",model="vectors"}