From 5f8a398bb9d12e65069442de28fe1b9036ff119f Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Fri, 27 Jan 2023 15:09:17 +0100
Subject: [PATCH] Add span_id to Span.char_span, update Doc/Span.char_span docs
 (#12196)

* Add span_id to Span.char_span, update Doc/Span.char_span docs

`Span.char_span(id=)` should be removed in the future.

* Also use Union[int, str] in Doc docstring
---
 spacy/tests/doc/test_span.py | 12 ++++++++++++
 spacy/tokens/doc.pyi         |  1 +
 spacy/tokens/doc.pyx         |  5 +++--
 spacy/tokens/span.pyi        |  2 ++
 spacy/tokens/span.pyx        | 10 ++++++----
 website/docs/api/doc.mdx     | 19 ++++++++++---------
 website/docs/api/span.mdx    |  2 ++
 7 files changed, 36 insertions(+), 15 deletions(-)

diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py
index d02f305f4..b4631037a 100644
--- a/spacy/tests/doc/test_span.py
+++ b/spacy/tests/doc/test_span.py
@@ -163,6 +163,18 @@ def test_char_span(doc, i_sent, i, j, text):
         assert span.text == text
 
 
+def test_char_span_attributes(doc):
+    label = "LABEL"
+    kb_id = "KB_ID"
+    span_id = "SPAN_ID"
+    span1 = doc.char_span(20, 45, label=label, kb_id=kb_id, span_id=span_id)
+    span2 = doc[1:].char_span(15, 40, label=label, kb_id=kb_id, span_id=span_id)
+    assert span1.text == span2.text
+    assert span1.label_ == span2.label_ == label
+    assert span1.kb_id_ == span2.kb_id_ == kb_id
+    assert span1.id_ == span2.id_ == span_id
+
+
 def test_spans_sent_spans(doc):
     sents = list(doc.sents)
     assert sents[0].start == 0
diff --git a/spacy/tokens/doc.pyi b/spacy/tokens/doc.pyi
index f0cdaee87..9d45960ab 100644
--- a/spacy/tokens/doc.pyi
+++ b/spacy/tokens/doc.pyi
@@ -108,6 +108,7 @@ class Doc:
         kb_id: Union[int, str] = ...,
         vector: Optional[Floats1d] = ...,
         alignment_mode: str = ...,
+        span_id: Union[int, str] = ...,
     ) -> Span: ...
     def similarity(self, other: Union[Doc, Span, Token, Lexeme]) -> float: ...
     @property
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 075bc4d15..7dfe0ca9f 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -528,9 +528,9 @@ cdef class Doc:
         doc (Doc): The parent document.
         start_idx (int): The index of the first character of the span.
         end_idx (int): The index of the first character after the span.
-        label (uint64 or string): A label to attach to the Span, e.g. for
+        label (Union[int, str]): A label to attach to the Span, e.g. for
             named entities.
-        kb_id (uint64 or string):  An ID from a KB to capture the meaning of a
+        kb_id (Union[int, str]):  An ID from a KB to capture the meaning of a
             named entity.
         vector (ndarray[ndim=1, dtype='float32']): A meaning representation of
             the span.
@@ -539,6 +539,7 @@ cdef class Doc:
             with token boundaries), "contract" (span of all tokens completely
             within the character span), "expand" (span of all tokens at least
             partially covered by the character span). Defaults to "strict".
+        span_id (Union[int, str]): An identifier to associate with the span.
         RETURNS (Span): The newly constructed object.
 
         DOCS: https://spacy.io/api/doc#char_span
diff --git a/spacy/tokens/span.pyi b/spacy/tokens/span.pyi
index 00226098a..a92f19e20 100644
--- a/spacy/tokens/span.pyi
+++ b/spacy/tokens/span.pyi
@@ -98,7 +98,9 @@ class Span:
         label: Union[int, str] = ...,
         kb_id: Union[int, str] = ...,
         vector: Optional[Floats1d] = ...,
+        id: Union[int, str] = ...,
         alignment_mode: str = ...,
+        span_id: Union[int, str] = ...,
     ) -> Span: ...
     @property
     def conjuncts(self) -> Tuple[Token]: ...
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 2912dd705..cfe1236df 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -639,26 +639,28 @@ cdef class Span:
         else:
             return self.doc[root]
 
-    def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, id=0, alignment_mode="strict"):
+    def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, id=0, alignment_mode="strict", span_id=0):
         """Create a `Span` object from the slice `span.text[start : end]`.
 
         start (int): The index of the first character of the span.
         end (int): The index of the first character after the span.
-        label (uint64 or string): A label to attach to the Span, e.g. for
+        label (Union[int, str]): A label to attach to the Span, e.g. for
             named entities.
-        kb_id (uint64 or string):  An ID from a KB to capture the meaning of a named entity.
+        kb_id (Union[int, str]):  An ID from a KB to capture the meaning of a named entity.
         vector (ndarray[ndim=1, dtype='float32']): A meaning representation of
             the span.
+        id (Union[int, str]): Unused.
         alignment_mode (str): How character indices are aligned to token
             boundaries. Options: "strict" (character indices must be aligned
             with token boundaries), "contract" (span of all tokens completely
             within the character span), "expand" (span of all tokens at least
             partially covered by the character span). Defaults to "strict".
+        span_id (Union[int, str]): An identifier to associate with the span.
         RETURNS (Span): The newly constructed object.
         """
         start_idx += self.c.start_char
         end_idx += self.c.start_char
-        return self.doc.char_span(start_idx, end_idx, label=label, kb_id=kb_id, vector=vector, alignment_mode=alignment_mode)
+        return self.doc.char_span(start_idx, end_idx, label=label, kb_id=kb_id, vector=vector, alignment_mode=alignment_mode, span_id=span_id)
 
     @property
     def conjuncts(self):
diff --git a/website/docs/api/doc.mdx b/website/docs/api/doc.mdx
index a5f3de6be..13c59c4af 100644
--- a/website/docs/api/doc.mdx
+++ b/website/docs/api/doc.mdx
@@ -209,15 +209,16 @@ alignment mode `"strict".
 > assert span.text == "New York"
 > ```
 
-| Name             | Description                                                                                                                                                                                                                                                                  |
-| ---------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `start`          | The index of the first character of the span. ~~int~~                                                                                                                                                                                                                        |
-| `end`            | The index of the last character after the span. ~~int~~                                                                                                                                                                                                                      |
-| `label`          | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~                                                                                                                                                                                                  |
-| `kb_id`          | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~                                                                                                                                                                                    |
-| `vector`         | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~                                                                                                                                                                                               |
-| `alignment_mode` | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ |
-| **RETURNS**      | The newly constructed object or `None`. ~~Optional[Span]~~                                                                                                                                                                                                                   |
+| Name                                     | Description                                                                                                                                                                                                                                                                  |
+| ---------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `start`                                  | The index of the first character of the span. ~~int~~                                                                                                                                                                                                                        |
+| `end`                                    | The index of the last character after the span. ~~int~~                                                                                                                                                                                                                      |
+| `label`                                  | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~                                                                                                                                                                                                  |
+| `kb_id`                                  | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~                                                                                                                                                                                    |
+| `vector`                                 | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~                                                                                                                                                                                               |
+| `alignment_mode`                         | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ |
+| `span_id` <Tag variant="new">3.3.1</Tag> | An identifier to associate with the span. ~~Union[int, str]~~                                                                                                                                                                                                                |
+| **RETURNS**                              | The newly constructed object or `None`. ~~Optional[Span]~~                                                                                                                                                                                                                   |
 
 ## Doc.set_ents {id="set_ents",tag="method",version="3"}
 
diff --git a/website/docs/api/span.mdx b/website/docs/api/span.mdx
index a135f5ec9..41422a5b4 100644
--- a/website/docs/api/span.mdx
+++ b/website/docs/api/span.mdx
@@ -193,7 +193,9 @@ the character indices don't map to a valid span.
 | `label`                                         | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~                                                                                                                                                                                                  |
 | `kb_id`                                         | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~                                                                                                                                                                                    |
 | `vector`                                        | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~                                                                                                                                                                                               |
+| `id`                                            | Unused. ~~Union[int, str]~~                                                                                                                                                                                                                                                  |
 | `alignment_mode` <Tag variant="new">3.5.1</Tag> | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ |
+| `span_id` <Tag variant="new">3.5.1</Tag>        | An identifier to associate with the span. ~~Union[int, str]~~                                                                                                                                                                                                                |
 | **RETURNS**                                     | The newly constructed object or `None`. ~~Optional[Span]~~                                                                                                                                                                                                                   |
 
 ## Span.similarity {id="similarity",tag="method",model="vectors"}