Add alignment_mode argument to Span.char_span() (#12145)

* Add alignment_mode argument to Span.char_span()

* Update website

* Update spacy/tokens/span.pyx

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>

* Add test

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
2025-10-31 07:57:35 +03:00 · 2023-01-27 20:43:40 +10:00 · 2023-01-27 20:43:40 +10:00 · 774c10fa39
commit 774c10fa39
parent c68e6b8a96
4 changed files with 26 additions and 11 deletions
--- a/spacy/tests/doc/test_span.py
+++ b/spacy/tests/doc/test_span.py
@@ -367,6 +367,14 @@ def test_spans_by_character(doc):
            span1.start_char + 1, span1.end_char, label="GPE", alignment_mode="unk"
        )
    # Span.char_span + alignment mode "contract"
    span2 = doc[0:2].char_span(
        span1.start_char - 3, span1.end_char, label="GPE", alignment_mode="contract"
    )
    assert span1.start_char == span2.start_char
    assert span1.end_char == span2.end_char
    assert span2.label_ == "GPE"
 def test_span_to_array(doc):
    span = doc[1:-2]
--- a/spacy/tokens/span.pyi
+++ b/spacy/tokens/span.pyi
@@ -98,6 +98,7 @@ class Span:
        label: Union[int, str] = ...,
        kb_id: Union[int, str] = ...,
        vector: Optional[Floats1d] = ...,
        alignment_mode: str = ...,
    ) -> Span: ...
    @property
    def conjuncts(self) -> Tuple[Token]: ...
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -362,7 +362,7 @@ cdef class Span:
        result = xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm)
        # ensure we get a scalar back (numpy does this automatically but cupy doesn't)
        return result.item()
-    
+
    cpdef np.ndarray to_array(self, object py_attr_ids):
        """Given a list of M attribute IDs, export the tokens to a numpy
        `ndarray` of shape `(N, M)`, where `N` is the length of the document.
@@ -639,7 +639,7 @@ cdef class Span:
        else:
            return self.doc[root]
-    def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, id=0):
+    def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, id=0, alignment_mode="strict"):
        """Create a `Span` object from the slice `span.text[start : end]`.
        start (int): The index of the first character of the span.
@@ -649,11 +649,16 @@ cdef class Span:
        kb_id (uint64 or string):  An ID from a KB to capture the meaning of a named entity.
        vector (ndarray[ndim=1, dtype='float32']): A meaning representation of
            the span.
        alignment_mode (str): How character indices are aligned to token
            boundaries. Options: "strict" (character indices must be aligned
            with token boundaries), "contract" (span of all tokens completely
            within the character span), "expand" (span of all tokens at least
            partially covered by the character span). Defaults to "strict".
        RETURNS (Span): The newly constructed object.
        """
        start_idx += self.c.start_char
        end_idx += self.c.start_char
-        return self.doc.char_span(start_idx, end_idx, label=label, kb_id=kb_id, vector=vector)
+        return self.doc.char_span(start_idx, end_idx, label=label, kb_id=kb_id, vector=vector, alignment_mode=alignment_mode)
    @property
    def conjuncts(self):
--- a/website/docs/api/span.mdx
+++ b/website/docs/api/span.mdx
@@ -186,14 +186,15 @@ the character indices don't map to a valid span.
 > assert span.text == "New York"
 > ```
-| Name        | Description                                                                               |
+| Name                                            | Description                                                                                                                                                                                                                                                                  |
-| ----------- | ----------------------------------------------------------------------------------------- |
+| ----------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `start`     | The index of the first character of the span. ~~int~~                                     |
+| `start`                                         | The index of the first character of the span. ~~int~~                                                                                                                                                                                                                        |
-| `end`       | The index of the last character after the span. ~~int~~                                   |
+| `end`                                           | The index of the last character after the span. ~~int~~                                                                                                                                                                                                                      |
-| `label`     | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~               |
+| `label`                                         | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~                                                                                                                                                                                                  |
-| `kb_id`     | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ |
+| `kb_id`                                         | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~                                                                                                                                                                                    |
-| `vector`    | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~            |
+| `vector`                                        | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~                                                                                                                                                                                               |
-| **RETURNS** | The newly constructed object or `None`. ~~Optional[Span]~~                                |
+| `alignment_mode` <Tag variant="new">3.5.1</Tag> | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ |
 | **RETURNS**                                     | The newly constructed object or `None`. ~~Optional[Span]~~                                                                                                                                                                                                                   |
 ## Span.similarity {id="similarity",tag="method",model="vectors"}