Add alignment_mode argument to Span.char_span() (#12145)

* Add alignment_mode argument to Span.char_span() * Update website * Update spacy/tokens/span.pyx Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com> * Add test Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
2025-12-30 13:33:16 +03:00 · 2023-01-27 20:43:40 +10:00 · 2023-01-27 20:43:40 +10:00 · 774c10fa39
commit 774c10fa39
parent c68e6b8a96
4 changed files with 26 additions and 11 deletions
--- a/spacy/tests/doc/test_span.py
+++ b/spacy/tests/doc/test_span.py
@ -367,6 +367,14 @@ def test_spans_by_character(doc):
            span1.start_char + 1, span1.end_char, label="GPE", alignment_mode="unk"
        )

+    # Span.char_span + alignment mode "contract"
+    span2 = doc[0:2].char_span(
+        span1.start_char - 3, span1.end_char, label="GPE", alignment_mode="contract"
+    )
+    assert span1.start_char == span2.start_char
+    assert span1.end_char == span2.end_char
+    assert span2.label_ == "GPE"
+

 def test_span_to_array(doc):
    span = doc[1:-2]
--- a/spacy/tokens/span.pyi
+++ b/spacy/tokens/span.pyi
@ -98,6 +98,7 @@ class Span:
        label: Union[int, str] = ...,
        kb_id: Union[int, str] = ...,
        vector: Optional[Floats1d] = ...,
+        alignment_mode: str = ...,
    ) -> Span: ...
    @property
    def conjuncts(self) -> Tuple[Token]: ...
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@ -362,7 +362,7 @@ cdef class Span:
        result = xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm)
        # ensure we get a scalar back (numpy does this automatically but cupy doesn't)
        return result.item()
-    
+
    cpdef np.ndarray to_array(self, object py_attr_ids):
        """Given a list of M attribute IDs, export the tokens to a numpy
        `ndarray` of shape `(N, M)`, where `N` is the length of the document.
@ -639,7 +639,7 @@ cdef class Span:
        else:
            return self.doc[root]

-    def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, id=0):
+    def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, id=0, alignment_mode="strict"):
        """Create a `Span` object from the slice `span.text[start : end]`.

        start (int): The index of the first character of the span.
@ -649,11 +649,16 @@ cdef class Span:
        kb_id (uint64 or string):  An ID from a KB to capture the meaning of a named entity.
        vector (ndarray[ndim=1, dtype='float32']): A meaning representation of
            the span.
+        alignment_mode (str): How character indices are aligned to token
+            boundaries. Options: "strict" (character indices must be aligned
+            with token boundaries), "contract" (span of all tokens completely
+            within the character span), "expand" (span of all tokens at least
+            partially covered by the character span). Defaults to "strict".
        RETURNS (Span): The newly constructed object.
        """
        start_idx += self.c.start_char
        end_idx += self.c.start_char
-        return self.doc.char_span(start_idx, end_idx, label=label, kb_id=kb_id, vector=vector)
+        return self.doc.char_span(start_idx, end_idx, label=label, kb_id=kb_id, vector=vector, alignment_mode=alignment_mode)

    @property
    def conjuncts(self):
--- a/website/docs/api/span.mdx
+++ b/website/docs/api/span.mdx
@ -186,14 +186,15 @@ the character indices don't map to a valid span.
 > assert span.text == "New York"
 > ```

-| Name        | Description                                                                               |
-| ----------- | ----------------------------------------------------------------------------------------- |
-| `start`     | The index of the first character of the span. ~~int~~                                     |
-| `end`       | The index of the last character after the span. ~~int~~                                   |
-| `label`     | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~               |
-| `kb_id`     | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ |
-| `vector`    | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~            |
-| **RETURNS** | The newly constructed object or `None`. ~~Optional[Span]~~                                |
+| Name                                            | Description                                                                                                                                                                                                                                                                  |
+| ----------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `start`                                         | The index of the first character of the span. ~~int~~                                                                                                                                                                                                                        |
+| `end`                                           | The index of the last character after the span. ~~int~~                                                                                                                                                                                                                      |
+| `label`                                         | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~                                                                                                                                                                                                  |
+| `kb_id`                                         | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~                                                                                                                                                                                    |
+| `vector`                                        | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~                                                                                                                                                                                               |
+| `alignment_mode` <Tag variant="new">3.5.1</Tag> | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ |
+| **RETURNS**                                     | The newly constructed object or `None`. ~~Optional[Span]~~                                                                                                                                                                                                                   |

 ## Span.similarity {id="similarity",tag="method",model="vectors"}