Update sent_starts in Example.from_dict (#7847)

* Update sent_starts in Example.from_dict. Update `sent_starts` for `Example.from_dict` so that `Optional[bool]` values have the same meaning as for `Token.is_sent_start`. Use `Optional[bool]` as the type for sent start values in the docs. * Use helper function for conversion to ternary ints
2026-01-09 02:01:22 +03:00 · 2021-04-22 11:32:45 +02:00 · 2021-04-22 11:32:45 +02:00 · f68fc29130
commit f68fc29130
parent f4339f9bff
6 changed files with 28 additions and 7 deletions
--- a/spacy/tests/training/test_new_example.py
+++ b/spacy/tests/training/test_new_example.py
@ -2,6 +2,7 @@ import pytest
 from spacy.training.example import Example
 from spacy.tokens import Doc
 from spacy.vocab import Vocab
+from spacy.util import to_ternary_int


 def test_Example_init_requires_doc_objects():
@ -121,7 +122,7 @@ def test_Example_from_dict_with_morphology(annots):
    [
        {
            "words": ["This", "is", "one", "sentence", "this", "is", "another"],
-            "sent_starts": [1, 0, 0, 0, 1, 0, 0],
+            "sent_starts": [1, False, 0, None, True, -1, -5.7],
        }
    ],
 )
@ -131,7 +132,12 @@ def test_Example_from_dict_with_sent_start(annots):
    example = Example.from_dict(predicted, annots)
    assert len(list(example.reference.sents)) == 2
    for i, token in enumerate(example.reference):
-        assert bool(token.is_sent_start) == bool(annots["sent_starts"][i])
+        if to_ternary_int(annots["sent_starts"][i]) == 1:
+            assert token.is_sent_start is True
+        elif to_ternary_int(annots["sent_starts"][i]) == 0:
+            assert token.is_sent_start is None
+        else:
+            assert token.is_sent_start is False


@pytest.mark.parametrize(
--- a/spacy/training/example.pyx
+++ b/spacy/training/example.pyx
@ -13,7 +13,7 @@ from .iob_utils import biluo_tags_to_spans
 from ..errors import Errors, Warnings
 from ..pipeline._parser_internals import nonproj
 from ..tokens.token cimport MISSING_DEP
-from ..util import logger
+from ..util import logger, to_ternary_int


 cpdef Doc annotations_to_doc(vocab, tok_annot, doc_annot):
@ -338,7 +338,7 @@ def _annot2array(vocab, tok_annot, doc_annot):
            values.append([vocab.strings.add(h) if h is not None else MISSING_DEP for h in value])
        elif key == "SENT_START":
            attrs.append(key)
-            values.append(value)
+            values.append([to_ternary_int(v) for v in value])
        elif key == "MORPH":
            attrs.append(key)
            values.append([vocab.morphology.add(v) for v in value])
--- a/spacy/util.py
+++ b/spacy/util.py
@ -1526,3 +1526,18 @@ def check_lexeme_norms(vocab, component_name):
    if len(lexeme_norms) == 0 and vocab.lang in LEXEME_NORM_LANGS:
        langs = ", ".join(LEXEME_NORM_LANGS)
        logger.debug(Warnings.W033.format(model=component_name, langs=langs))
+
+
+def to_ternary_int(val) -> int:
+    """Convert a value to the ternary 1/0/-1 int used for True/None/False in
+    attributes such as SENT_START: True/1/1.0 is 1 (True), None/0/0.0 is 0
+    (None), any other values are -1 (False).
+
+    val (Any): The value to convert.
+    RETURNS (int): 1, 0 or -1.
+    """
+    # Handle the singletons by identity first: False == 0 in Python, so it
+    # must be caught before the numeric equality checks below.
+    if val is True:
+        return 1
+    elif val is None:
+        return 0
+    elif val is False:
+        return -1
+    # Compare numbers by value rather than identity. `val is 1` depends on
+    # interpreter-level int caching, and truncating floats with int() would
+    # map values such as 0.5 or 1.5 onto 0 or 1 instead of -1.
+    elif val == 1:
+        return 1
+    elif val == 0:
+        return 0
+    else:
+        return -1
--- a/website/docs/api/data-formats.md
+++ b/website/docs/api/data-formats.md
@ -390,7 +390,7 @@ file to keep track of your settings and hyperparameters and your own
 >    "tags": List[str],
 >    "pos": List[str],
 >    "morphs": List[str],
->    "sent_starts": List[bool],
+>    "sent_starts": List[Optional[bool]],
 >    "deps": List[string],
 >    "heads": List[int],
 >    "entities": List[str],
--- a/website/docs/api/doc.md
+++ b/website/docs/api/doc.md
@ -44,7 +44,7 @@ Construct a `Doc` object. The most common way to get a `Doc` object is via the
 | `lemmas` <Tag variant="new">3</Tag>      | A list of strings, of the same length as `words`, to assign as `token.lemma` for each word. Defaults to `None`. ~~Optional[List[str]]~~                                                            |
 | `heads` <Tag variant="new">3</Tag>       | A list of values, of the same length as `words`, to assign as the head for each word. Head indices are the absolute position of the head in the `Doc`. Defaults to `None`. ~~Optional[List[int]]~~ |
 | `deps` <Tag variant="new">3</Tag>        | A list of strings, of the same length as `words`, to assign as `token.dep` for each word. Defaults to `None`. ~~Optional[List[str]]~~                                                              |
-| `sent_starts` <Tag variant="new">3</Tag> | A list of values, of the same length as `words`, to assign as `token.is_sent_start`. Will be overridden by heads if `heads` is provided. Defaults to `None`. ~~Optional[List[Union[bool, None]]~~  |
+| `sent_starts` <Tag variant="new">3</Tag> | A list of values, of the same length as `words`, to assign as `token.is_sent_start`. Will be overridden by heads if `heads` is provided. Defaults to `None`. ~~Optional[List[Optional[bool]]]~~    |
 | `ents` <Tag variant="new">3</Tag>        | A list of strings, of the same length of `words`, to assign the token-based IOB tag. Defaults to `None`. ~~Optional[List[str]]~~                                                                   |

 ## Doc.\_\_getitem\_\_ {#getitem tag="method"}
--- a/website/docs/api/token.md
+++ b/website/docs/api/token.md
@ -364,7 +364,7 @@ unknown. Defaults to `True` for the first token in the `Doc`.

 | Name        | Description                                   |
 | ----------- | --------------------------------------------- |
-| **RETURNS** | Whether the token starts a sentence. ~~bool~~ |
+| **RETURNS** | Whether the token starts a sentence. ~~Optional[bool]~~ |

 ## Token.has_vector {#has_vector tag="property" model="vectors"}