mirror of
https://github.com/explosion/spaCy.git
synced 2025-02-05 14:10:34 +03:00
Update sent_starts in Example.from_dict (#7847)
* Update sent_starts in Example.from_dict Update `sent_starts` for `Example.from_dict` so that `Optional[bool]` values have the same meaning as for `Token.is_sent_start`. Use `Optional[bool]` as the type for sent start values in the docs. * Use helper function for conversion to ternary ints
This commit is contained in:
parent
f4339f9bff
commit
f68fc29130
|
@@ -2,6 +2,7 @@ import pytest
|
||||||
from spacy.training.example import Example
|
from spacy.training.example import Example
|
||||||
from spacy.tokens import Doc
|
from spacy.tokens import Doc
|
||||||
from spacy.vocab import Vocab
|
from spacy.vocab import Vocab
|
||||||
|
from spacy.util import to_ternary_int
|
||||||
|
|
||||||
|
|
||||||
def test_Example_init_requires_doc_objects():
|
def test_Example_init_requires_doc_objects():
|
||||||
|
@@ -121,7 +122,7 @@ def test_Example_from_dict_with_morphology(annots):
|
||||||
[
|
[
|
||||||
{
|
{
|
||||||
"words": ["This", "is", "one", "sentence", "this", "is", "another"],
|
"words": ["This", "is", "one", "sentence", "this", "is", "another"],
|
||||||
"sent_starts": [1, 0, 0, 0, 1, 0, 0],
|
"sent_starts": [1, False, 0, None, True, -1, -5.7],
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
|
@@ -131,7 +132,12 @@ def test_Example_from_dict_with_sent_start(annots):
|
||||||
example = Example.from_dict(predicted, annots)
|
example = Example.from_dict(predicted, annots)
|
||||||
assert len(list(example.reference.sents)) == 2
|
assert len(list(example.reference.sents)) == 2
|
||||||
for i, token in enumerate(example.reference):
|
for i, token in enumerate(example.reference):
|
||||||
assert bool(token.is_sent_start) == bool(annots["sent_starts"][i])
|
if to_ternary_int(annots["sent_starts"][i]) == 1:
|
||||||
|
assert token.is_sent_start is True
|
||||||
|
elif to_ternary_int(annots["sent_starts"][i]) == 0:
|
||||||
|
assert token.is_sent_start is None
|
||||||
|
else:
|
||||||
|
assert token.is_sent_start is False
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
|
|
|
@@ -13,7 +13,7 @@ from .iob_utils import biluo_tags_to_spans
|
||||||
from ..errors import Errors, Warnings
|
from ..errors import Errors, Warnings
|
||||||
from ..pipeline._parser_internals import nonproj
|
from ..pipeline._parser_internals import nonproj
|
||||||
from ..tokens.token cimport MISSING_DEP
|
from ..tokens.token cimport MISSING_DEP
|
||||||
from ..util import logger
|
from ..util import logger, to_ternary_int
|
||||||
|
|
||||||
|
|
||||||
cpdef Doc annotations_to_doc(vocab, tok_annot, doc_annot):
|
cpdef Doc annotations_to_doc(vocab, tok_annot, doc_annot):
|
||||||
|
@@ -338,7 +338,7 @@ def _annot2array(vocab, tok_annot, doc_annot):
|
||||||
values.append([vocab.strings.add(h) if h is not None else MISSING_DEP for h in value])
|
values.append([vocab.strings.add(h) if h is not None else MISSING_DEP for h in value])
|
||||||
elif key == "SENT_START":
|
elif key == "SENT_START":
|
||||||
attrs.append(key)
|
attrs.append(key)
|
||||||
values.append(value)
|
values.append([to_ternary_int(v) for v in value])
|
||||||
elif key == "MORPH":
|
elif key == "MORPH":
|
||||||
attrs.append(key)
|
attrs.append(key)
|
||||||
values.append([vocab.morphology.add(v) for v in value])
|
values.append([vocab.morphology.add(v) for v in value])
|
||||||
|
|
|
@@ -1526,3 +1526,18 @@ def check_lexeme_norms(vocab, component_name):
|
||||||
if len(lexeme_norms) == 0 and vocab.lang in LEXEME_NORM_LANGS:
|
if len(lexeme_norms) == 0 and vocab.lang in LEXEME_NORM_LANGS:
|
||||||
langs = ", ".join(LEXEME_NORM_LANGS)
|
langs = ", ".join(LEXEME_NORM_LANGS)
|
||||||
logger.debug(Warnings.W033.format(model=component_name, langs=langs))
|
logger.debug(Warnings.W033.format(model=component_name, langs=langs))
|
||||||
|
|
||||||
|
|
||||||
|
def to_ternary_int(val) -> int:
    """Convert a value to the ternary 1/0/-1 int used for True/None/False in
    attributes such as SENT_START: True/1/1.0 is 1 (True), None/0/0.0 is 0
    (None), any other values are -1 (False).

    val: The value to convert (bool, None, int, float, or anything else).
    RETURNS (int): 1, 0, or -1.
    """
    # Check the singletons by identity first: `False == 0` is truthy, so an
    # equality test alone would wrongly map False to 0 instead of -1.
    if val is True:
        return 1
    if val is None:
        return 0
    if val is False:
        return -1
    # Truncate floats so 1.0 and 0.0 behave like 1 and 0.
    if isinstance(val, float):
        val = int(val)
    # Use equality rather than `is` with int literals: identity comparison
    # against literals is a SyntaxWarning on CPython >= 3.8 and only works by
    # accident of small-int caching (it also fails for numpy integer scalars).
    if val == 1:
        return 1
    if val == 0:
        return 0
    return -1
|
||||||
|
|
|
@@ -390,7 +390,7 @@ file to keep track of your settings and hyperparameters and your own
|
||||||
> "tags": List[str],
|
> "tags": List[str],
|
||||||
> "pos": List[str],
|
> "pos": List[str],
|
||||||
> "morphs": List[str],
|
> "morphs": List[str],
|
||||||
> "sent_starts": List[bool],
|
> "sent_starts": List[Optional[bool]],
|
||||||
> "deps": List[string],
|
> "deps": List[string],
|
||||||
> "heads": List[int],
|
> "heads": List[int],
|
||||||
> "entities": List[str],
|
> "entities": List[str],
|
||||||
|
|
|
@@ -44,7 +44,7 @@ Construct a `Doc` object. The most common way to get a `Doc` object is via the
|
||||||
| `lemmas` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.lemma` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
|
| `lemmas` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.lemma` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
|
||||||
| `heads` <Tag variant="new">3</Tag> | A list of values, of the same length as `words`, to assign as the head for each word. Head indices are the absolute position of the head in the `Doc`. Defaults to `None`. ~~Optional[List[int]]~~ |
|
| `heads` <Tag variant="new">3</Tag> | A list of values, of the same length as `words`, to assign as the head for each word. Head indices are the absolute position of the head in the `Doc`. Defaults to `None`. ~~Optional[List[int]]~~ |
|
||||||
| `deps` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.dep` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
|
| `deps` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.dep` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
|
||||||
| `sent_starts` <Tag variant="new">3</Tag> | A list of values, of the same length as `words`, to assign as `token.is_sent_start`. Will be overridden by heads if `heads` is provided. Defaults to `None`. ~~Optional[List[Union[bool, None]]~~ |
|
| `sent_starts` <Tag variant="new">3</Tag> | A list of values, of the same length as `words`, to assign as `token.is_sent_start`. Will be overridden by heads if `heads` is provided. Defaults to `None`. ~~Optional[List[Optional[bool]]]~~ |
|
||||||
| `ents` <Tag variant="new">3</Tag> | A list of strings, of the same length of `words`, to assign the token-based IOB tag. Defaults to `None`. ~~Optional[List[str]]~~ |
|
| `ents` <Tag variant="new">3</Tag> | A list of strings, of the same length of `words`, to assign the token-based IOB tag. Defaults to `None`. ~~Optional[List[str]]~~ |
|
||||||
|
|
||||||
## Doc.\_\_getitem\_\_ {#getitem tag="method"}
|
## Doc.\_\_getitem\_\_ {#getitem tag="method"}
|
||||||
|
|
|
@@ -364,7 +364,7 @@ unknown. Defaults to `True` for the first token in the `Doc`.
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ----------- | --------------------------------------------- |
|
| ----------- | --------------------------------------------- |
|
||||||
| **RETURNS** | Whether the token starts a sentence. ~~bool~~ |
|
| **RETURNS** | Whether the token starts a sentence. ~~Optional[bool]~~ |
|
||||||
|
|
||||||
## Token.has_vector {#has_vector tag="property" model="vectors"}
|
## Token.has_vector {#has_vector tag="property" model="vectors"}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user