mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 17:24:41 +03:00
Update sent_starts in Example.from_dict (#7847)
* Update sent_starts in Example.from_dict Update `sent_starts` for `Example.from_dict` so that `Optional[bool]` values have the same meaning as for `Token.is_sent_start`. Use `Optional[bool]` as the type for sent start values in the docs. * Use helper function for conversion to ternary ints
This commit is contained in:
parent
f4339f9bff
commit
f68fc29130
|
@ -2,6 +2,7 @@ import pytest
|
|||
from spacy.training.example import Example
|
||||
from spacy.tokens import Doc
|
||||
from spacy.vocab import Vocab
|
||||
from spacy.util import to_ternary_int
|
||||
|
||||
|
||||
def test_Example_init_requires_doc_objects():
|
||||
|
@ -121,7 +122,7 @@ def test_Example_from_dict_with_morphology(annots):
|
|||
[
|
||||
{
|
||||
"words": ["This", "is", "one", "sentence", "this", "is", "another"],
|
||||
"sent_starts": [1, 0, 0, 0, 1, 0, 0],
|
||||
"sent_starts": [1, False, 0, None, True, -1, -5.7],
|
||||
}
|
||||
],
|
||||
)
|
||||
|
@ -131,7 +132,12 @@ def test_Example_from_dict_with_sent_start(annots):
|
|||
example = Example.from_dict(predicted, annots)
|
||||
assert len(list(example.reference.sents)) == 2
|
||||
for i, token in enumerate(example.reference):
|
||||
assert bool(token.is_sent_start) == bool(annots["sent_starts"][i])
|
||||
if to_ternary_int(annots["sent_starts"][i]) == 1:
|
||||
assert token.is_sent_start is True
|
||||
elif to_ternary_int(annots["sent_starts"][i]) == 0:
|
||||
assert token.is_sent_start is None
|
||||
else:
|
||||
assert token.is_sent_start is False
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
|
|
|
@ -13,7 +13,7 @@ from .iob_utils import biluo_tags_to_spans
|
|||
from ..errors import Errors, Warnings
|
||||
from ..pipeline._parser_internals import nonproj
|
||||
from ..tokens.token cimport MISSING_DEP
|
||||
from ..util import logger
|
||||
from ..util import logger, to_ternary_int
|
||||
|
||||
|
||||
cpdef Doc annotations_to_doc(vocab, tok_annot, doc_annot):
|
||||
|
@ -338,7 +338,7 @@ def _annot2array(vocab, tok_annot, doc_annot):
|
|||
values.append([vocab.strings.add(h) if h is not None else MISSING_DEP for h in value])
|
||||
elif key == "SENT_START":
|
||||
attrs.append(key)
|
||||
values.append(value)
|
||||
values.append([to_ternary_int(v) for v in value])
|
||||
elif key == "MORPH":
|
||||
attrs.append(key)
|
||||
values.append([vocab.morphology.add(v) for v in value])
|
||||
|
|
|
@ -1526,3 +1526,18 @@ def check_lexeme_norms(vocab, component_name):
|
|||
if len(lexeme_norms) == 0 and vocab.lang in LEXEME_NORM_LANGS:
|
||||
langs = ", ".join(LEXEME_NORM_LANGS)
|
||||
logger.debug(Warnings.W033.format(model=component_name, langs=langs))
|
||||
|
||||
|
||||
def to_ternary_int(val) -> int:
    """Convert a value to the ternary 1/0/-1 int used for True/None/False in
    attributes such as SENT_START: True/1/1.0 is 1 (True), None/0/0.0 is 0
    (None), any other values are -1 (False).

    val: The value to convert (typically bool, int, float, or None).
    RETURNS (int): 1, 0, or -1.
    """
    # Check the singletons by identity first so that bools are handled
    # before the numeric comparisons below (False == 0 and True == 1 in
    # Python, which would otherwise map False to 0 instead of -1).
    if val is True:
        return 1
    elif val is None:
        return 0
    elif val is False:
        return -1
    # Numeric equality covers both int and float without truncation:
    # 1/1.0 -> 1 and 0/0.0 -> 0, while e.g. -0.7 or -5.7 fall through
    # to -1 as the docstring specifies. Avoids `val is 1` / `val is 0`,
    # which depend on CPython small-int caching and warn on Python 3.8+.
    elif val == 1:
        return 1
    elif val == 0:
        return 0
    else:
        return -1
|
||||
|
|
|
@ -390,7 +390,7 @@ file to keep track of your settings and hyperparameters and your own
|
|||
> "tags": List[str],
|
||||
> "pos": List[str],
|
||||
> "morphs": List[str],
|
||||
> "sent_starts": List[bool],
|
||||
> "sent_starts": List[Optional[bool]],
|
||||
> "deps": List[string],
|
||||
> "heads": List[int],
|
||||
> "entities": List[str],
|
||||
|
|
|
@ -44,7 +44,7 @@ Construct a `Doc` object. The most common way to get a `Doc` object is via the
|
|||
| `lemmas` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.lemma` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
|
||||
| `heads` <Tag variant="new">3</Tag> | A list of values, of the same length as `words`, to assign as the head for each word. Head indices are the absolute position of the head in the `Doc`. Defaults to `None`. ~~Optional[List[int]]~~ |
|
||||
| `deps` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.dep` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
|
||||
| `sent_starts` <Tag variant="new">3</Tag> | A list of values, of the same length as `words`, to assign as `token.is_sent_start`. Will be overridden by heads if `heads` is provided. Defaults to `None`. ~~Optional[List[Union[bool, None]]~~ |
|
||||
| `sent_starts` <Tag variant="new">3</Tag> | A list of values, of the same length as `words`, to assign as `token.is_sent_start`. Will be overridden by heads if `heads` is provided. Defaults to `None`. ~~Optional[List[Optional[bool]]]~~ |
|
||||
| `ents` <Tag variant="new">3</Tag> | A list of strings, of the same length of `words`, to assign the token-based IOB tag. Defaults to `None`. ~~Optional[List[str]]~~ |
|
||||
|
||||
## Doc.\_\_getitem\_\_ {#getitem tag="method"}
|
||||
|
|
|
@ -364,7 +364,7 @@ unknown. Defaults to `True` for the first token in the `Doc`.
|
|||
|
||||
| Name | Description |
|
||||
| ----------- | --------------------------------------------- |
|
||||
| **RETURNS** | Whether the token starts a sentence. ~~bool~~ |
|
||||
| **RETURNS** | Whether the token starts a sentence. ~~Optional[bool]~~ |
|
||||
|
||||
## Token.has_vector {#has_vector tag="property" model="vectors"}
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user