Add alignment mode error and fix Doc.char_span docs (#6820)

* Raise an error on an unrecognized alignment mode rather than defaulting to `strict`
* Fix the `Doc.char_span` API doc alignment mode details
2025-12-23 10:03:15 +03:00 · 2021-01-27 13:40:42 +01:00 · 2021-01-27 13:40:42 +01:00 · 4096a79de7
commit 4096a79de7
parent d5ef245bb1
4 changed files with 19 additions and 11 deletions
--- a/spacy/errors.py
+++ b/spacy/errors.py
@ -591,6 +591,7 @@ class Errors(object):
    E200 = ("Specifying a base model with a pretrained component '{component}' "
            "can not be combined with adding a pretrained Tok2Vec layer.")
    E201 = ("Span index out of range.")
+    E202 = ("Unsupported alignment mode '{mode}'. Supported modes: {modes}.")


@add_codes
--- a/spacy/tests/doc/test_span.py
+++ b/spacy/tests/doc/test_span.py
@ -197,6 +197,12 @@ def test_spans_by_character(doc):
    assert span1.end_char == span2.end_char
    assert span2.label_ == "GPE"

+    # unsupported alignment mode
+    with pytest.raises(ValueError):
+        span2 = doc.char_span(
+            span1.start_char + 1, span1.end_char, label="GPE", alignment_mode="unk"
+        )
+

 def test_span_to_array(doc):
    span = doc[1:-2]
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@ -379,8 +379,9 @@ cdef class Doc:
            label = self.vocab.strings.add(label)
        if not isinstance(kb_id, int):
            kb_id = self.vocab.strings.add(kb_id)
-        if alignment_mode not in ("strict", "contract", "expand"):
-            alignment_mode = "strict"
+        alignment_modes = ("strict", "contract", "expand")
+        if alignment_mode not in alignment_modes:
+            raise ValueError(Errors.E202.format(mode=alignment_mode, modes=", ".join(alignment_modes)))
        cdef int start = token_by_char(self.c, self.length, start_idx)
        if start < 0 or (alignment_mode == "strict" and start_idx != self[start].idx):
            return None
--- a/website/docs/api/doc.md
+++ b/website/docs/api/doc.md
@ -200,13 +200,13 @@ Create a `Span` object from the slice `doc.text[start_idx:end_idx]`. Returns
 > ```

 | Name                                 | Type                                     | Description                                                                                                                                                                                                                                                  |
-| ------------------------------------ | ---------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| ------------------------------------ | ---------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | `start_idx`                          | int                                      | The index of the first character of the span.                                                                                                                                                                                                                |
 | `end_idx`                            | int                                      | The index of the last character after the span.                                                                                                                                                                                                              |
 | `label`                              | uint64 / unicode                         | A label to attach to the span, e.g. for named entities.                                                                                                                                                                                                      |
 | `kb_id` <Tag variant="new">2.2</Tag> | uint64 / unicode                         | An ID from a knowledge base to capture the meaning of a named entity.                                                                                                                                                                                        |
 | `vector`                             | `numpy.ndarray[ndim=1, dtype='float32']` | A meaning representation of the span.                                                                                                                                                                                                                        |
-| `alignment_mode`                     | `str`                                    | How character indices snap to token boundaries. Options: "strict" (no snapping), "inside" (span of all tokens completely within the character span), "outside" (span of all tokens at least partially covered by the character span). Defaults to "strict". |
+| `alignment_mode`                     | `str`                                    | How character indices snap to token boundaries. Options: "strict" (no snapping), "contract" (span of all tokens completely within the character span), "expand" (span of all tokens at least partially covered by the character span). Defaults to "strict". |
 | **RETURNS**                          | `Span`                                   | The newly constructed object or `None`.                                                                                                                                                                                                                      |

 ## Doc.similarity {#similarity tag="method" model="vectors"}