Mirror of https://github.com/explosion/spaCy.git, synced 2025-11-04 01:48:04 +03:00
	Update sent_starts in Example.from_dict (#7847)
* Update sent_starts in Example.from_dict

  Update `sent_starts` for `Example.from_dict` so that `Optional[bool]` values
  have the same meaning as for `Token.is_sent_start`. Use `Optional[bool]` as
  the type for sent start values in the docs.

* Use helper function for conversion to ternary ints
This commit is contained in:
parent f4339f9bff
commit f68fc29130
@@ -2,6 +2,7 @@ import pytest
 from spacy.training.example import Example
 from spacy.tokens import Doc
 from spacy.vocab import Vocab
+from spacy.util import to_ternary_int


 def test_Example_init_requires_doc_objects():
@@ -121,7 +122,7 @@ def test_Example_from_dict_with_morphology(annots):
     [
         {
             "words": ["This", "is", "one", "sentence", "this", "is", "another"],
-            "sent_starts": [1, 0, 0, 0, 1, 0, 0],
+            "sent_starts": [1, False, 0, None, True, -1, -5.7],
         }
     ],
 )
@@ -131,7 +132,12 @@ def test_Example_from_dict_with_sent_start(annots):
     example = Example.from_dict(predicted, annots)
     assert len(list(example.reference.sents)) == 2
     for i, token in enumerate(example.reference):
-        assert bool(token.is_sent_start) == bool(annots["sent_starts"][i])
+        if to_ternary_int(annots["sent_starts"][i]) == 1:
+            assert token.is_sent_start is True
+        elif to_ternary_int(annots["sent_starts"][i]) == 0:
+            assert token.is_sent_start is None
+        else:
+            assert token.is_sent_start is False


 @pytest.mark.parametrize(
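For orientation, a minimal sketch of what the updated test exercises (a sketch only, assuming spaCy v3.x with this change applied; all names and values come from the diff above): mixed `sent_starts` annotations are normalized so that `Token.is_sent_start` on the reference doc reads back as `True`, `None`, or `False`.

```python
# Sketch mirroring test_Example_from_dict_with_sent_start above.
from spacy.tokens import Doc
from spacy.training.example import Example
from spacy.util import to_ternary_int
from spacy.vocab import Vocab

words = ["This", "is", "one", "sentence", "this", "is", "another"]
annots = {"words": words, "sent_starts": [1, False, 0, None, True, -1, -5.7]}

example = Example.from_dict(Doc(Vocab(), words=words), annots)
for i, token in enumerate(example.reference):
    # ternary 1 -> True, 0 -> None (unknown), -1 -> False
    expected = {1: True, 0: None, -1: False}[to_ternary_int(annots["sent_starts"][i])]
    assert token.is_sent_start is expected
```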
@@ -13,7 +13,7 @@ from .iob_utils import biluo_tags_to_spans
 from ..errors import Errors, Warnings
 from ..pipeline._parser_internals import nonproj
 from ..tokens.token cimport MISSING_DEP
-from ..util import logger
+from ..util import logger, to_ternary_int


 cpdef Doc annotations_to_doc(vocab, tok_annot, doc_annot):
@@ -338,7 +338,7 @@ def _annot2array(vocab, tok_annot, doc_annot):
             values.append([vocab.strings.add(h) if h is not None else MISSING_DEP for h in value])
         elif key == "SENT_START":
             attrs.append(key)
-            values.append(value)
+            values.append([to_ternary_int(v) for v in value])
         elif key == "MORPH":
             attrs.append(key)
             values.append([vocab.morphology.add(v) for v in value])
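The changed line in `_annot2array` is the piece that makes this work: raw `sent_starts` annotations are mapped to ternary ints before being written into the `SENT_START` column. A small illustration (a sketch, reusing the values from the updated test):

```python
from spacy.util import to_ternary_int

raw_sent_starts = [1, False, 0, None, True, -1, -5.7]
print([to_ternary_int(v) for v in raw_sent_starts])
# [1, -1, 0, 0, 1, -1, -1]
```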
@@ -1526,3 +1526,18 @@ def check_lexeme_norms(vocab, component_name):
     if len(lexeme_norms) == 0 and vocab.lang in LEXEME_NORM_LANGS:
         langs = ", ".join(LEXEME_NORM_LANGS)
         logger.debug(Warnings.W033.format(model=component_name, langs=langs))
+
+
+def to_ternary_int(val) -> int:
+    """Convert a value to the ternary 1/0/-1 int used for True/None/False in
+    attributes such as SENT_START: True/1/1.0 is 1 (True), None/0/0.0 is 0
+    (None), any other values are -1 (False).
+    """
+    if isinstance(val, float):
+        val = int(val)
+    if val is True or val is 1:
+        return 1
+    elif val is None or val is 0:
+        return 0
+    else:
+        return -1
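The helper's contract, spelled out (these values follow directly from the function body added above):

```python
from spacy.util import to_ternary_int

assert to_ternary_int(True) == 1    # True / 1 / 1.0 -> 1
assert to_ternary_int(1.0) == 1
assert to_ternary_int(None) == 0    # None / 0 / 0.0 -> 0
assert to_ternary_int(0.0) == 0
assert to_ternary_int(False) == -1  # anything else -> -1, including False
assert to_ternary_int(-5.7) == -1   # other floats are truncated, then fall through to -1
```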
@@ -390,7 +390,7 @@ file to keep track of your settings and hyperparameters and your own
 >    "tags": List[str],
 >    "pos": List[str],
 >    "morphs": List[str],
->    "sent_starts": List[bool],
+>    "sent_starts": List[Optional[bool]],
 >    "deps": List[string],
 >    "heads": List[int],
 >    "entities": List[str],
@@ -44,7 +44,7 @@ Construct a `Doc` object. The most common way to get a `Doc` object is via the
 | `lemmas` <Tag variant="new">3</Tag>      | A list of strings, of the same length as `words`, to assign as `token.lemma` for each word. Defaults to `None`. ~~Optional[List[str]]~~                                                            |
 | `heads` <Tag variant="new">3</Tag>       | A list of values, of the same length as `words`, to assign as the head for each word. Head indices are the absolute position of the head in the `Doc`. Defaults to `None`. ~~Optional[List[int]]~~ |
 | `deps` <Tag variant="new">3</Tag>        | A list of strings, of the same length as `words`, to assign as `token.dep` for each word. Defaults to `None`. ~~Optional[List[str]]~~                                                              |
-| `sent_starts` <Tag variant="new">3</Tag> | A list of values, of the same length as `words`, to assign as `token.is_sent_start`. Will be overridden by heads if `heads` is provided. Defaults to `None`. ~~Optional[List[Union[bool, None]]~~  |
+| `sent_starts` <Tag variant="new">3</Tag> | A list of values, of the same length as `words`, to assign as `token.is_sent_start`. Will be overridden by heads if `heads` is provided. Defaults to `None`. ~~Optional[List[Optional[bool]]]~~    |
 | `ents` <Tag variant="new">3</Tag>        | A list of strings, of the same length of `words`, to assign the token-based IOB tag. Defaults to `None`. ~~Optional[List[str]]~~                                                                   |
 
 ## Doc.\_\_getitem\_\_ {#getitem tag="method"}
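A minimal sketch of the documented constructor argument (assuming spaCy v3.x; the example words are illustrative): `sent_starts` takes one `Optional[bool]` per word, and `None` leaves the boundary unspecified.

```python
from spacy.tokens import Doc
from spacy.vocab import Vocab

words = ["Hello", "world", ".", "Bye", "now", "."]
doc = Doc(Vocab(), words=words, sent_starts=[True, None, None, True, None, None])

assert doc[0].is_sent_start is True
assert doc[1].is_sent_start is None  # boundary left unspecified
assert doc[3].is_sent_start is True
```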
@@ -364,7 +364,7 @@ unknown. Defaults to `True` for the first token in the `Doc`.
 
 | Name        | Description                                   |
 | ----------- | --------------------------------------------- |
-| **RETURNS** | Whether the token starts a sentence. ~~bool~~ |
+| **RETURNS** | Whether the token starts a sentence. ~~Optional[bool]~~ |
 
 ## Token.has_vector {#has_vector tag="property" model="vectors"}
 