Remove side effects from Doc.__init__() (#11506)

* Remove side effects from Doc.__init__()

* Changes based on review comment

* Readd test

* Change interface of Doc.__init__()

* Simplify test

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>

* Update doc.md

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
This commit is contained in:
Richard Hudson 2022-09-26 15:58:21 +02:00 committed by GitHub
parent f40d2fac29
commit 6f692a06d5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 37 additions and 22 deletions

View File

@ -82,6 +82,21 @@ def test_issue2396(en_vocab):
assert (span.get_lca_matrix() == matrix).all()
@pytest.mark.issue(11499)
def test_init_args_unmodified(en_vocab):
    """Check that constructing a Doc leaves the caller's lists untouched.

    The `ents` and `sent_starts` lists passed to `Doc(...)` are compared
    against their original literal values after the call.
    """
    tokens = ["A", "sentence"]
    # Includes an empty-string tag and a `True` flag.
    # NOTE(review): presumably these are the values the constructor
    # normalises internally -- confirm against Doc.__init__.
    iob_tags = ["B-TYPE1", ""]
    starts = [True, False]

    # The resulting Doc is intentionally discarded; only the effect of the
    # call on the argument lists is under test.
    Doc(vocab=en_vocab, words=tokens, ents=iob_tags, sent_starts=starts)

    assert iob_tags == ["B-TYPE1", ""]
    assert starts == [True, False]
@pytest.mark.parametrize("text", ["-0.23", "+123,456", "±1"])
@pytest.mark.parametrize("lang_cls", [English, MultiLanguage])
@pytest.mark.issue(2782)

View File

@ -72,7 +72,7 @@ class Doc:
lemmas: Optional[List[str]] = ...,
heads: Optional[List[int]] = ...,
deps: Optional[List[str]] = ...,
sent_starts: Optional[List[Union[bool, None]]] = ...,
sent_starts: Optional[List[Union[bool, int, None]]] = ...,
ents: Optional[List[str]] = ...,
) -> None: ...
@property

View File

@ -217,9 +217,9 @@ cdef class Doc:
head in the doc. Defaults to None.
deps (Optional[List[str]]): A list of unicode strings, of the same
length as words, to assign as token.dep. Defaults to None.
sent_starts (Optional[List[Union[bool, None]]]): A list of values, of
the same length as words, to assign as token.is_sent_start. Will be
overridden by heads if heads is provided. Defaults to None.
sent_starts (Optional[List[Union[bool, int, None]]]): A list of values,
of the same length as words, to assign as token.is_sent_start. Will
be overridden by heads if heads is provided. Defaults to None.
ents (Optional[List[str]]): A list of unicode strings, of the same
length as words, as IOB tags to assign as token.ent_iob and
token.ent_type. Defaults to None.
@ -285,6 +285,7 @@ cdef class Doc:
heads = [0] * len(deps)
if heads and not deps:
raise ValueError(Errors.E1017)
sent_starts = list(sent_starts) if sent_starts is not None else None
if sent_starts is not None:
for i in range(len(sent_starts)):
if sent_starts[i] is True:
@ -300,12 +301,11 @@ cdef class Doc:
ent_iobs = None
ent_types = None
if ents is not None:
ents = [ent if ent != "" else None for ent in ents]
iob_strings = Token.iob_strings()
# make valid IOB2 out of IOB1 or IOB2
for i, ent in enumerate(ents):
if ent is "":
ents[i] = None
elif ent is not None and not isinstance(ent, str):
if ent is not None and not isinstance(ent, str):
raise ValueError(Errors.E177.format(tag=ent))
if i < len(ents) - 1:
# OI -> OB

View File

@ -31,21 +31,21 @@ Construct a `Doc` object. The most common way to get a `Doc` object is via the
> doc = Doc(nlp.vocab, words=words, spaces=spaces)
> ```
| Name | Description |
| ---------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `vocab` | A storage container for lexical types. ~~Vocab~~ |
| `words` | A list of strings or integer hash values to add to the document as words. ~~Optional[List[Union[str,int]]]~~ |
| `spaces` | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. ~~Optional[List[bool]]~~ |
| _keyword-only_ | |
| `user\_data` | Optional extra data to attach to the Doc. ~~Dict~~ |
| `tags` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.tag` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
| `pos` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.pos` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
| `morphs` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.morph` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
| `lemmas` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.lemma` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
| `heads` <Tag variant="new">3</Tag> | A list of values, of the same length as `words`, to assign as the head for each word. Head indices are the absolute position of the head in the `Doc`. Defaults to `None`. ~~Optional[List[int]]~~ |
| `deps` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.dep` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
| `sent_starts` <Tag variant="new">3</Tag> | A list of values, of the same length as `words`, to assign as `token.is_sent_start`. Will be overridden by heads if `heads` is provided. Defaults to `None`. ~~Optional[List[Optional[bool]]]~~ |
| `ents` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign the token-based IOB tag. Defaults to `None`. ~~Optional[List[str]]~~ |
| Name | Description |
| ---------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `vocab` | A storage container for lexical types. ~~Vocab~~ |
| `words` | A list of strings or integer hash values to add to the document as words. ~~Optional[List[Union[str,int]]]~~ |
| `spaces` | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. ~~Optional[List[bool]]~~ |
| _keyword-only_ | |
| `user\_data` | Optional extra data to attach to the Doc. ~~Dict~~ |
| `tags` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.tag` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
| `pos` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.pos` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
| `morphs` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.morph` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
| `lemmas` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.lemma` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
| `heads` <Tag variant="new">3</Tag> | A list of values, of the same length as `words`, to assign as the head for each word. Head indices are the absolute position of the head in the `Doc`. Defaults to `None`. ~~Optional[List[int]]~~ |
| `deps` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.dep` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
| `sent_starts` <Tag variant="new">3</Tag> | A list of values, of the same length as `words`, to assign as `token.is_sent_start`. Will be overridden by heads if `heads` is provided. Defaults to `None`. ~~Optional[List[Union[bool, int, None]]]~~ |
| `ents` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign the token-based IOB tag. Defaults to `None`. ~~Optional[List[str]]~~ |
## Doc.\_\_getitem\_\_ {#getitem tag="method"}