mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 09:26:27 +03:00
Remove side effects from Doc.__init__() (#11506)
* Remove side effects from Doc.__init__() * Changes based on review comment * Readd test * Change interface of Doc.__init__() * Simplify test Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com> * Update doc.md Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
This commit is contained in:
parent
f40d2fac29
commit
6f692a06d5
|
@ -82,6 +82,21 @@ def test_issue2396(en_vocab):
|
|||
assert (span.get_lca_matrix() == matrix).all()
|
||||
|
||||
|
||||
@pytest.mark.issue(11499)
|
||||
def test_init_args_unmodified(en_vocab):
|
||||
words = ["A", "sentence"]
|
||||
ents = ["B-TYPE1", ""]
|
||||
sent_starts = [True, False]
|
||||
Doc(
|
||||
vocab=en_vocab,
|
||||
words=words,
|
||||
ents=ents,
|
||||
sent_starts=sent_starts,
|
||||
)
|
||||
assert ents == ["B-TYPE1", ""]
|
||||
assert sent_starts == [True, False]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("text", ["-0.23", "+123,456", "±1"])
|
||||
@pytest.mark.parametrize("lang_cls", [English, MultiLanguage])
|
||||
@pytest.mark.issue(2782)
|
||||
|
|
|
@ -72,7 +72,7 @@ class Doc:
|
|||
lemmas: Optional[List[str]] = ...,
|
||||
heads: Optional[List[int]] = ...,
|
||||
deps: Optional[List[str]] = ...,
|
||||
sent_starts: Optional[List[Union[bool, None]]] = ...,
|
||||
sent_starts: Optional[List[Union[bool, int, None]]] = ...,
|
||||
ents: Optional[List[str]] = ...,
|
||||
) -> None: ...
|
||||
@property
|
||||
|
|
|
@ -217,9 +217,9 @@ cdef class Doc:
|
|||
head in the doc. Defaults to None.
|
||||
deps (Optional[List[str]]): A list of unicode strings, of the same
|
||||
length as words, to assign as token.dep. Defaults to None.
|
||||
sent_starts (Optional[List[Union[bool, None]]]): A list of values, of
|
||||
the same length as words, to assign as token.is_sent_start. Will be
|
||||
overridden by heads if heads is provided. Defaults to None.
|
||||
sent_starts (Optional[List[Union[bool, int, None]]]): A list of values,
|
||||
of the same length as words, to assign as token.is_sent_start. Will
|
||||
be overridden by heads if heads is provided. Defaults to None.
|
||||
ents (Optional[List[str]]): A list of unicode strings, of the same
|
||||
length as words, as IOB tags to assign as token.ent_iob and
|
||||
token.ent_type. Defaults to None.
|
||||
|
@ -285,6 +285,7 @@ cdef class Doc:
|
|||
heads = [0] * len(deps)
|
||||
if heads and not deps:
|
||||
raise ValueError(Errors.E1017)
|
||||
sent_starts = list(sent_starts) if sent_starts is not None else None
|
||||
if sent_starts is not None:
|
||||
for i in range(len(sent_starts)):
|
||||
if sent_starts[i] is True:
|
||||
|
@ -300,12 +301,11 @@ cdef class Doc:
|
|||
ent_iobs = None
|
||||
ent_types = None
|
||||
if ents is not None:
|
||||
ents = [ent if ent != "" else None for ent in ents]
|
||||
iob_strings = Token.iob_strings()
|
||||
# make valid IOB2 out of IOB1 or IOB2
|
||||
for i, ent in enumerate(ents):
|
||||
if ent is "":
|
||||
ents[i] = None
|
||||
elif ent is not None and not isinstance(ent, str):
|
||||
if ent is not None and not isinstance(ent, str):
|
||||
raise ValueError(Errors.E177.format(tag=ent))
|
||||
if i < len(ents) - 1:
|
||||
# OI -> OB
|
||||
|
|
|
@ -31,21 +31,21 @@ Construct a `Doc` object. The most common way to get a `Doc` object is via the
|
|||
> doc = Doc(nlp.vocab, words=words, spaces=spaces)
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ---------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `vocab` | A storage container for lexical types. ~~Vocab~~ |
|
||||
| `words` | A list of strings or integer hash values to add to the document as words. ~~Optional[List[Union[str,int]]]~~ |
|
||||
| `spaces` | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. ~~Optional[List[bool]]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `user\_data` | Optional extra data to attach to the Doc. ~~Dict~~ |
|
||||
| `tags` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.tag` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
|
||||
| `pos` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.pos` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
|
||||
| `morphs` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.morph` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
|
||||
| `lemmas` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.lemma` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
|
||||
| `heads` <Tag variant="new">3</Tag> | A list of values, of the same length as `words`, to assign as the head for each word. Head indices are the absolute position of the head in the `Doc`. Defaults to `None`. ~~Optional[List[int]]~~ |
|
||||
| `deps` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.dep` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
|
||||
| `sent_starts` <Tag variant="new">3</Tag> | A list of values, of the same length as `words`, to assign as `token.is_sent_start`. Will be overridden by heads if `heads` is provided. Defaults to `None`. ~~Optional[List[Optional[bool]]]~~ |
|
||||
| `ents` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign the token-based IOB tag. Defaults to `None`. ~~Optional[List[str]]~~ |
|
||||
| Name | Description |
|
||||
| ---------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `vocab` | A storage container for lexical types. ~~Vocab~~ |
|
||||
| `words` | A list of strings or integer hash values to add to the document as words. ~~Optional[List[Union[str,int]]]~~ |
|
||||
| `spaces` | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. ~~Optional[List[bool]]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `user\_data` | Optional extra data to attach to the Doc. ~~Dict~~ |
|
||||
| `tags` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.tag` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
|
||||
| `pos` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.pos` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
|
||||
| `morphs` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.morph` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
|
||||
| `lemmas` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.lemma` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
|
||||
| `heads` <Tag variant="new">3</Tag> | A list of values, of the same length as `words`, to assign as the head for each word. Head indices are the absolute position of the head in the `Doc`. Defaults to `None`. ~~Optional[List[int]]~~ |
|
||||
| `deps` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.dep` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
|
||||
| `sent_starts` <Tag variant="new">3</Tag> | A list of values, of the same length as `words`, to assign as `token.is_sent_start`. Will be overridden by heads if `heads` is provided. Defaults to `None`. ~~Optional[List[Union[bool, int, None]]]~~ |
|
||||
| `ents` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign the token-based IOB tag. Defaults to `None`. ~~Optional[List[str]]~~ |
|
||||
|
||||
## Doc.\_\_getitem\_\_ {#getitem tag="method"}
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user