From 5f8a398bb9d12e65069442de28fe1b9036ff119f Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 27 Jan 2023 15:09:17 +0100 Subject: [PATCH] Add span_id to Span.char_span, update Doc/Span.char_span docs (#12196) * Add span_id to Span.char_span, update Doc/Span.char_span docs `Span.char_span(id=)` should be removed in the future. * Also use Union[int, str] in Doc docstring --- spacy/tests/doc/test_span.py | 12 ++++++++++++ spacy/tokens/doc.pyi | 1 + spacy/tokens/doc.pyx | 5 +++-- spacy/tokens/span.pyi | 2 ++ spacy/tokens/span.pyx | 10 ++++++---- website/docs/api/doc.mdx | 19 ++++++++++--------- website/docs/api/span.mdx | 2 ++ 7 files changed, 36 insertions(+), 15 deletions(-) diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py index d02f305f4..b4631037a 100644 --- a/spacy/tests/doc/test_span.py +++ b/spacy/tests/doc/test_span.py @@ -163,6 +163,18 @@ def test_char_span(doc, i_sent, i, j, text): assert span.text == text +def test_char_span_attributes(doc): + label = "LABEL" + kb_id = "KB_ID" + span_id = "SPAN_ID" + span1 = doc.char_span(20, 45, label=label, kb_id=kb_id, span_id=span_id) + span2 = doc[1:].char_span(15, 40, label=label, kb_id=kb_id, span_id=span_id) + assert span1.text == span2.text + assert span1.label_ == span2.label_ == label + assert span1.kb_id_ == span2.kb_id_ == kb_id + assert span1.id_ == span2.id_ == span_id + + def test_spans_sent_spans(doc): sents = list(doc.sents) assert sents[0].start == 0 diff --git a/spacy/tokens/doc.pyi b/spacy/tokens/doc.pyi index f0cdaee87..9d45960ab 100644 --- a/spacy/tokens/doc.pyi +++ b/spacy/tokens/doc.pyi @@ -108,6 +108,7 @@ class Doc: kb_id: Union[int, str] = ..., vector: Optional[Floats1d] = ..., alignment_mode: str = ..., + span_id: Union[int, str] = ..., ) -> Span: ... def similarity(self, other: Union[Doc, Span, Token, Lexeme]) -> float: ... @property diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 075bc4d15..7dfe0ca9f 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -528,9 +528,9 @@ cdef class Doc: doc (Doc): The parent document. start_idx (int): The index of the first character of the span. end_idx (int): The index of the first character after the span. - label (uint64 or string): A label to attach to the Span, e.g. for + label (Union[int, str]): A label to attach to the Span, e.g. for named entities. - kb_id (uint64 or string): An ID from a KB to capture the meaning of a + kb_id (Union[int, str]): An ID from a KB to capture the meaning of a named entity. vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span. @@ -539,6 +539,7 @@ cdef class Doc: with token boundaries), "contract" (span of all tokens completely within the character span), "expand" (span of all tokens at least partially covered by the character span). Defaults to "strict". + span_id (Union[int, str]): An identifier to associate with the span. RETURNS (Span): The newly constructed object. DOCS: https://spacy.io/api/doc#char_span diff --git a/spacy/tokens/span.pyi b/spacy/tokens/span.pyi index 00226098a..a92f19e20 100644 --- a/spacy/tokens/span.pyi +++ b/spacy/tokens/span.pyi @@ -98,7 +98,9 @@ class Span: label: Union[int, str] = ..., kb_id: Union[int, str] = ..., vector: Optional[Floats1d] = ..., + id: Union[int, str] = ..., alignment_mode: str = ..., + span_id: Union[int, str] = ..., ) -> Span: ... @property def conjuncts(self) -> Tuple[Token]: ... diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 2912dd705..cfe1236df 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -639,26 +639,28 @@ cdef class Span: else: return self.doc[root] - def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, id=0, alignment_mode="strict"): + def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, id=0, alignment_mode="strict", span_id=0): """Create a `Span` object from the slice `span.text[start : end]`. start (int): The index of the first character of the span. end (int): The index of the first character after the span. - label (uint64 or string): A label to attach to the Span, e.g. for + label (Union[int, str]): A label to attach to the Span, e.g. for named entities. - kb_id (uint64 or string): An ID from a KB to capture the meaning of a named entity. + kb_id (Union[int, str]): An ID from a KB to capture the meaning of a named entity. vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span. + id (Union[int, str]): Unused. alignment_mode (str): How character indices are aligned to token boundaries. Options: "strict" (character indices must be aligned with token boundaries), "contract" (span of all tokens completely within the character span), "expand" (span of all tokens at least partially covered by the character span). Defaults to "strict". + span_id (Union[int, str]): An identifier to associate with the span. RETURNS (Span): The newly constructed object. """ start_idx += self.c.start_char end_idx += self.c.start_char - return self.doc.char_span(start_idx, end_idx, label=label, kb_id=kb_id, vector=vector, alignment_mode=alignment_mode) + return self.doc.char_span(start_idx, end_idx, label=label, kb_id=kb_id, vector=vector, alignment_mode=alignment_mode, span_id=span_id) @property def conjuncts(self): diff --git a/website/docs/api/doc.mdx b/website/docs/api/doc.mdx index a5f3de6be..13c59c4af 100644 --- a/website/docs/api/doc.mdx +++ b/website/docs/api/doc.mdx @@ -209,15 +209,16 @@ alignment mode `"strict". > assert span.text == "New York" > ``` -| Name | Description | -| ---------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `start` | The index of the first character of the span. ~~int~~ | -| `end` | The index of the last character after the span. ~~int~~ | -| `label` | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~ | -| `kb_id` | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ | -| `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ | -| `alignment_mode` | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ | -| **RETURNS** | The newly constructed object or `None`. ~~Optional[Span]~~ | +| Name | Description | +| ---------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `start` | The index of the first character of the span. ~~int~~ | +| `end` | The index of the last character after the span. ~~int~~ | +| `label` | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~ | +| `kb_id` | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ | +| `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ | +| `alignment_mode` | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ | +| `span_id` 3.3.1 | An identifier to associate with the span. ~~Union[int, str]~~ | +| **RETURNS** | The newly constructed object or `None`. ~~Optional[Span]~~ | ## Doc.set_ents {id="set_ents",tag="method",version="3"} diff --git a/website/docs/api/span.mdx b/website/docs/api/span.mdx index a135f5ec9..41422a5b4 100644 --- a/website/docs/api/span.mdx +++ b/website/docs/api/span.mdx @@ -193,7 +193,9 @@ the character indices don't map to a valid span. | `label` | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~ | | `kb_id` | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ | | `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ | +| `id` | Unused. ~~Union[int, str]~~ | | `alignment_mode` 3.5.1 | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ | +| `span_id` 3.5.1 | An identifier to associate with the span. ~~Union[int, str]~~ | | **RETURNS** | The newly constructed object or `None`. ~~Optional[Span]~~ | ## Span.similarity {id="similarity",tag="method",model="vectors"}