Allow Doc.char_span to snap to token boundaries (#5849)

* Allow Doc.char_span to snap to token boundaries Add a `mode` option to allow `Doc.char_span` to snap to token boundaries. The `mode` options: * `strict`: character offsets must match token boundaries (default, same as before) * `inside`: all tokens completely within the character span * `outside`: all tokens at least partially covered by the character span Add a new helper function `token_by_char` that returns the token corresponding to a character position in the text. Update `token_by_start` and `token_by_end` to use `token_by_char` for more efficient searching. * Remove unused import * Rename mode to alignment_mode Rename `mode` to `alignment_mode` with the options `strict`/`contract`/`expand`. Any unrecognized modes are silently converted to `strict`.
2025-11-11 13:25:43 +03:00 · 2020-08-04 13:36:32 +02:00 · 2020-08-04 13:36:32 +02:00 · c62fd878a3
commit c62fd878a3
parent b841248589
3 changed files with 105 additions and 47 deletions
--- a/spacy/tests/doc/test_span.py
+++ b/spacy/tests/doc/test_span.py
@ -167,11 +167,30 @@ def test_spans_are_hashable(en_tokenizer):

 def test_spans_by_character(doc):
    span1 = doc[1:-2]
+
+    # default and specified alignment mode "strict"
    span2 = doc.char_span(span1.start_char, span1.end_char, label="GPE")
    assert span1.start_char == span2.start_char
    assert span1.end_char == span2.end_char
    assert span2.label_ == "GPE"

+    span2 = doc.char_span(span1.start_char, span1.end_char, label="GPE", alignment_mode="strict")
+    assert span1.start_char == span2.start_char
+    assert span1.end_char == span2.end_char
+    assert span2.label_ == "GPE"
+
+    # alignment mode "contract"
+    span2 = doc.char_span(span1.start_char - 3, span1.end_char, label="GPE", alignment_mode="contract")
+    assert span1.start_char == span2.start_char
+    assert span1.end_char == span2.end_char
+    assert span2.label_ == "GPE"
+
+    # alignment mode "expand"
+    span2 = doc.char_span(span1.start_char + 1, span1.end_char, label="GPE", alignment_mode="expand")
+    assert span1.start_char == span2.start_char
+    assert span1.end_char == span2.end_char
+    assert span2.label_ == "GPE"
+

 def test_span_to_array(doc):
    span = doc[1:-2]
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@ -352,17 +352,25 @@ cdef class Doc:
    def doc(self):
        return self

-    def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None):
-        """Create a `Span` object from the slice `doc.text[start : end]`.
+    def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, alignment_mode="strict"):
+        """Create a `Span` object from the slice
+        `doc.text[start_idx : end_idx]`. Returns None if no valid `Span` can be
+        created.

        doc (Doc): The parent document.
-        start (int): The index of the first character of the span.
-        end (int): The index of the first character after the span.
+        start_idx (int): The index of the first character of the span.
+        end_idx (int): The index of the first character after the span.
        label (uint64 or string): A label to attach to the Span, e.g. for
            named entities.
-        kb_id (uint64 or string):  An ID from a KB to capture the meaning of a named entity.
+        kb_id (uint64 or string):  An ID from a KB to capture the meaning of a
+            named entity.
        vector (ndarray[ndim=1, dtype='float32']): A meaning representation of
            the span.
+        alignment_mode (str): How character indices are aligned to token
+            boundaries. Options: "strict" (character indices must be aligned
+            with token boundaries), "contract" (span of all tokens completely
+            within the character span), "expand" (span of all tokens at least
+            partially covered by the character span). Defaults to "strict".
        RETURNS (Span): The newly constructed object.

        DOCS: https://spacy.io/api/doc#char_span
@ -371,12 +379,29 @@ cdef class Doc:
            label = self.vocab.strings.add(label)
        if not isinstance(kb_id, int):
            kb_id = self.vocab.strings.add(kb_id)
-        cdef int start = token_by_start(self.c, self.length, start_idx)
-        if start == -1:
+        if alignment_mode not in ("strict", "contract", "expand"):
+            alignment_mode = "strict"
+        cdef int start = token_by_char(self.c, self.length, start_idx)
+        if start < 0 or (alignment_mode == "strict" and start_idx != self[start].idx):
            return None
-        cdef int end = token_by_end(self.c, self.length, end_idx)
-        if end == -1:
+        # end_idx is exclusive, so find the token at one char before
+        cdef int end = token_by_char(self.c, self.length, end_idx - 1)
+        if end < 0 or (alignment_mode == "strict" and end_idx != self[end].idx + len(self[end])):
            return None
+        # Adjust start and end by alignment_mode
+        if alignment_mode == "contract":
+            if self[start].idx < start_idx:
+                start += 1
+            if end_idx < self[end].idx + len(self[end]):
+                end -= 1
+            # if no tokens are completely within the span, return None
+            if end < start:
+                return None
+        elif alignment_mode == "expand":
+            # Don't consider the trailing whitespace to be part of the previous
+            # token
+            if start_idx == self[start].idx + len(self[start]):
+                start += 1
        # Currently we have the token index, we want the range-end index
        end += 1
        cdef Span span = Span(self, start, end, label=label, kb_id=kb_id, vector=vector)
@ -1167,23 +1192,35 @@ cdef class Doc:


 cdef int token_by_start(const TokenC* tokens, int length, int start_char) except -2:
-    cdef int i
-    for i in range(length):
-        if tokens[i].idx == start_char:
+    cdef int i = token_by_char(tokens, length, start_char)
+    if i >= 0 and tokens[i].idx == start_char:
        return i
    else:
        return -1


 cdef int token_by_end(const TokenC* tokens, int length, int end_char) except -2:
-    cdef int i
-    for i in range(length):
-        if tokens[i].idx + tokens[i].lex.length == end_char:
+    # end_char is exclusive, so find the token at one char before
+    cdef int i = token_by_char(tokens, length, end_char - 1)
+    if i >= 0 and tokens[i].idx + tokens[i].lex.length == end_char:
        return i
    else:
        return -1


+cdef int token_by_char(const TokenC* tokens, int length, int char_idx) except -2:
+    cdef int start = 0, mid, end = length - 1
+    while start <= end:
+        mid = (start + end) / 2
+        if char_idx < tokens[mid].idx:
+            end = mid - 1
+        elif char_idx >= tokens[mid].idx + tokens[mid].lex.length + tokens[mid].spacy:
+            start = mid + 1
+        else:
+            return mid
+    return -1
+
+
 cdef int set_children_from_heads(TokenC* tokens, int length) except -1:
    cdef TokenC* head
    cdef TokenC* child
--- a/website/docs/api/doc.md
+++ b/website/docs/api/doc.md
@ -187,8 +187,9 @@ Remove a previously registered extension.

 ## Doc.char_span {#char_span tag="method" new="2"}

-Create a `Span` object from the slice `doc.text[start:end]`. Returns `None` if
-the character indices don't map to a valid span.
+Create a `Span` object from the slice `doc.text[start_idx:end_idx]`. Returns
+`None` if the character indices don't map to a valid span using the default
+alignment mode `"strict"`.

 > #### Example
 >
@ -199,12 +200,13 @@ the character indices don't map to a valid span.
 > ```

 | Name                                 | Type                                     | Description                                                                                                                                                                                                                                                 |
-| ------------------------------------ | ---------------------------------------- | --------------------------------------------------------------------- |
-| `start`                              | int                                      | The index of the first character of the span.                         |
-| `end`                                | int                                      | The index of the last character after the span.                       |
+| ------------------------------------ | ---------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `start_idx`                          | int                                      | The index of the first character of the span.                                                                                                                                                                                                               |
+| `end_idx`                            | int                                      | The index of the last character after the span.                                                                                                                                                                                                             |
 | `label`                              | uint64 / unicode                         | A label to attach to the span, e.g. for named entities.                                                                                                                                                                                                     |
 | `kb_id` <Tag variant="new">2.2</Tag> | uint64 / unicode                         | An ID from a knowledge base to capture the meaning of a named entity.                                                                                                                                                                                       |
 | `vector`                             | `numpy.ndarray[ndim=1, dtype='float32']` | A meaning representation of the span.                                                                                                                                                                                                                       |
+| `alignment_mode`                     | `str`                                    | How character indices snap to token boundaries. Options: "strict" (no snapping), "contract" (span of all tokens completely within the character span), "expand" (span of all tokens at least partially covered by the character span). Defaults to "strict". |
 | **RETURNS**                          | `Span`                                   | The newly constructed object or `None`.                                                                                                                                                                                                                     |

 ## Doc.similarity {#similarity tag="method" model="vectors"}
@ -647,7 +649,7 @@ The L2 norm of the document's vector representation.
 ## Attributes {#attributes}

 | Name                                    | Type         | Description                                                                                                                                                                     |
-| --------------------------------------- | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| --------------------------------------- | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `text`                                  | unicode      | A unicode representation of the document text.                                                                                                                                  |
 | `text_with_ws`                          | unicode      | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`.                                                                                           |
 | `mem`                                   | `Pool`       | The document's local memory heap, for all C data it owns.                                                                                                                       |