Update sent_starts in Example.from_dict (#7847)

* Update sent_starts in Example.from_dict

Update `sent_starts` for `Example.from_dict` so that `Optional[bool]`
values have the same meaning as for `Token.is_sent_start`.

Use `Optional[bool]` as the type for sent start values in the docs.

* Use helper function for conversion to ternary ints
This commit is contained in:
Adriane Boyd 2021-04-22 11:32:45 +02:00 committed by GitHub
parent f4339f9bff
commit f68fc29130
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 28 additions and 7 deletions

View File

@ -2,6 +2,7 @@ import pytest
from spacy.training.example import Example
from spacy.tokens import Doc
from spacy.vocab import Vocab
from spacy.util import to_ternary_int
def test_Example_init_requires_doc_objects():
@ -121,7 +122,7 @@ def test_Example_from_dict_with_morphology(annots):
[
{
"words": ["This", "is", "one", "sentence", "this", "is", "another"],
"sent_starts": [1, 0, 0, 0, 1, 0, 0],
"sent_starts": [1, False, 0, None, True, -1, -5.7],
}
],
)
@ -131,7 +132,12 @@ def test_Example_from_dict_with_sent_start(annots):
example = Example.from_dict(predicted, annots)
assert len(list(example.reference.sents)) == 2
for i, token in enumerate(example.reference):
assert bool(token.is_sent_start) == bool(annots["sent_starts"][i])
if to_ternary_int(annots["sent_starts"][i]) == 1:
assert token.is_sent_start is True
elif to_ternary_int(annots["sent_starts"][i]) == 0:
assert token.is_sent_start is None
else:
assert token.is_sent_start is False
@pytest.mark.parametrize(

View File

@ -13,7 +13,7 @@ from .iob_utils import biluo_tags_to_spans
from ..errors import Errors, Warnings
from ..pipeline._parser_internals import nonproj
from ..tokens.token cimport MISSING_DEP
from ..util import logger
from ..util import logger, to_ternary_int
cpdef Doc annotations_to_doc(vocab, tok_annot, doc_annot):
@ -338,7 +338,7 @@ def _annot2array(vocab, tok_annot, doc_annot):
values.append([vocab.strings.add(h) if h is not None else MISSING_DEP for h in value])
elif key == "SENT_START":
attrs.append(key)
values.append(value)
values.append([to_ternary_int(v) for v in value])
elif key == "MORPH":
attrs.append(key)
values.append([vocab.morphology.add(v) for v in value])

View File

@ -1526,3 +1526,18 @@ def check_lexeme_norms(vocab, component_name):
if len(lexeme_norms) == 0 and vocab.lang in LEXEME_NORM_LANGS:
langs = ", ".join(LEXEME_NORM_LANGS)
logger.debug(Warnings.W033.format(model=component_name, langs=langs))
def to_ternary_int(val) -> int:
    """Convert a value to the ternary 1/0/-1 int used for True/None/False in
    attributes such as SENT_START: True/1/1.0 is 1 (True), None/0/0.0 is 0
    (None), any other values are -1 (False).

    val: The value to convert (typically Optional[bool], int or float).
    RETURNS (int): 1, 0 or -1.
    """
    # Check the singletons by identity first. False must be handled before
    # the numeric comparisons below because `False == 0` is true in Python,
    # yet False has to map to -1 rather than 0.
    if val is True:
        return 1
    elif val is None:
        return 0
    elif val is False:
        return -1
    # Compare numbers by equality, not identity: `val is 1` only works through
    # CPython's small-int caching and raises a SyntaxWarning on Python 3.8+.
    # Floats are deliberately not truncated, so only exactly 1.0 / 0.0 match
    # and values such as 0.5, -0.3 or NaN fall through to -1 as documented.
    elif val == 1:
        return 1
    elif val == 0:
        return 0
    else:
        return -1

View File

@ -390,7 +390,7 @@ file to keep track of your settings and hyperparameters and your own
> "tags": List[str],
> "pos": List[str],
> "morphs": List[str],
> "sent_starts": List[bool],
> "sent_starts": List[Optional[bool]],
> "deps": List[string],
> "heads": List[int],
> "entities": List[str],

View File

@ -44,7 +44,7 @@ Construct a `Doc` object. The most common way to get a `Doc` object is via the
| `lemmas` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.lemma` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
| `heads` <Tag variant="new">3</Tag> | A list of values, of the same length as `words`, to assign as the head for each word. Head indices are the absolute position of the head in the `Doc`. Defaults to `None`. ~~Optional[List[int]]~~ |
| `deps` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.dep` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
| `sent_starts` <Tag variant="new">3</Tag> | A list of values, of the same length as `words`, to assign as `token.is_sent_start`. Will be overridden by heads if `heads` is provided. Defaults to `None`. ~~Optional[List[Union[bool, None]]~~ |
| `sent_starts` <Tag variant="new">3</Tag> | A list of values, of the same length as `words`, to assign as `token.is_sent_start`. Will be overridden by heads if `heads` is provided. Defaults to `None`. ~~Optional[List[Optional[bool]]]~~ |
| `ents` <Tag variant="new">3</Tag> | A list of strings, of the same length of `words`, to assign the token-based IOB tag. Defaults to `None`. ~~Optional[List[str]]~~ |
## Doc.\_\_getitem\_\_ {#getitem tag="method"}

View File

@ -364,7 +364,7 @@ unknown. Defaults to `True` for the first token in the `Doc`.
| Name | Description |
| ----------- | --------------------------------------------- |
| **RETURNS** | Whether the token starts a sentence. ~~bool~~ |
| **RETURNS** | Whether the token starts a sentence. ~~Optional[bool]~~ |
## Token.has_vector {#has_vector tag="property" model="vectors"}