Remove side effects from Doc.__init__() (#11506)

* Remove side effects from Doc.__init__() * Changes based on review comment * Readd test * Change interface of Doc.__init__() * Simplify test Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com> * Update doc.md Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
2025-07-26 07:59:47 +03:00 · 2022-09-26 15:58:21 +02:00 · 2022-09-26 15:58:21 +02:00 · 6f692a06d5
commit 6f692a06d5
parent f40d2fac29
4 changed files with 37 additions and 22 deletions
--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@ -82,6 +82,21 @@ def test_issue2396(en_vocab):
    assert (span.get_lca_matrix() == matrix).all()


+@pytest.mark.issue(11499)
+def test_init_args_unmodified(en_vocab):
+    words = ["A", "sentence"]
+    ents = ["B-TYPE1", ""]
+    sent_starts = [True, False]
+    Doc(
+        vocab=en_vocab,
+        words=words,
+        ents=ents,
+        sent_starts=sent_starts,
+    )
+    assert ents == ["B-TYPE1", ""]
+    assert sent_starts == [True, False]
+
+
@pytest.mark.parametrize("text", ["-0.23", "+123,456", "±1"])
@pytest.mark.parametrize("lang_cls", [English, MultiLanguage])
@pytest.mark.issue(2782)
--- a/spacy/tokens/doc.pyi
+++ b/spacy/tokens/doc.pyi
@ -72,7 +72,7 @@ class Doc:
        lemmas: Optional[List[str]] = ...,
        heads: Optional[List[int]] = ...,
        deps: Optional[List[str]] = ...,
-        sent_starts: Optional[List[Union[bool, None]]] = ...,
+        sent_starts: Optional[List[Union[bool, int, None]]] = ...,
        ents: Optional[List[str]] = ...,
    ) -> None: ...
    @property
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@ -217,9 +217,9 @@ cdef class Doc:
            head in the doc. Defaults to None.
        deps (Optional[List[str]]): A list of unicode strings, of the same
            length as words, to assign as token.dep. Defaults to None.
-        sent_starts (Optional[List[Union[bool, None]]]): A list of values, of
-            the same length as words, to assign as token.is_sent_start. Will be
-            overridden by heads if heads is provided. Defaults to None.
+        sent_starts (Optional[List[Union[bool, int, None]]]): A list of values, 
+            of the same length as words, to assign as token.is_sent_start. Will 
+            be overridden by heads if heads is provided. Defaults to None.
        ents (Optional[List[str]]): A list of unicode strings, of the same
            length as words, as IOB tags to assign as token.ent_iob and
            token.ent_type. Defaults to None.
@ -285,6 +285,7 @@ cdef class Doc:
            heads = [0] * len(deps)
        if heads and not deps:
            raise ValueError(Errors.E1017)
+        sent_starts = list(sent_starts) if sent_starts is not None else None
        if sent_starts is not None:
            for i in range(len(sent_starts)):
                if sent_starts[i] is True:
@ -300,12 +301,11 @@ cdef class Doc:
        ent_iobs = None
        ent_types = None
        if ents is not None:
+            ents = [ent if ent != "" else None for ent in ents]
            iob_strings = Token.iob_strings()
            # make valid IOB2 out of IOB1 or IOB2
            for i, ent in enumerate(ents):
-                if ent is "":
-                    ents[i] = None
-                elif ent is not None and not isinstance(ent, str):
+                if ent is not None and not isinstance(ent, str):
                    raise ValueError(Errors.E177.format(tag=ent))
                if i < len(ents) - 1:
                    # OI -> OB
--- a/website/docs/api/doc.md
+++ b/website/docs/api/doc.md
@ -31,21 +31,21 @@ Construct a `Doc` object. The most common way to get a `Doc` object is via the
 > doc = Doc(nlp.vocab, words=words, spaces=spaces)
 > ```

-| Name                                     | Description                                                                                                                                                                                        |
-| ---------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `vocab`                                  | A storage container for lexical types. ~~Vocab~~                                                                                                                                                   |
-| `words`                                  | A list of strings or integer hash values to add to the document as words. ~~Optional[List[Union[str,int]]]~~                                                                                       |
-| `spaces`                                 | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. ~~Optional[List[bool]]~~       |
-| _keyword-only_                           |                                                                                                                                                                                                    |
-| `user\_data`                             | Optional extra data to attach to the Doc. ~~Dict~~                                                                                                                                                 |
-| `tags` <Tag variant="new">3</Tag>        | A list of strings, of the same length as `words`, to assign as `token.tag` for each word. Defaults to `None`. ~~Optional[List[str]]~~                                                              |
-| `pos` <Tag variant="new">3</Tag>         | A list of strings, of the same length as `words`, to assign as `token.pos` for each word. Defaults to `None`. ~~Optional[List[str]]~~                                                              |
-| `morphs` <Tag variant="new">3</Tag>      | A list of strings, of the same length as `words`, to assign as `token.morph` for each word. Defaults to `None`. ~~Optional[List[str]]~~                                                            |
-| `lemmas` <Tag variant="new">3</Tag>      | A list of strings, of the same length as `words`, to assign as `token.lemma` for each word. Defaults to `None`. ~~Optional[List[str]]~~                                                            |
-| `heads` <Tag variant="new">3</Tag>       | A list of values, of the same length as `words`, to assign as the head for each word. Head indices are the absolute position of the head in the `Doc`. Defaults to `None`. ~~Optional[List[int]]~~ |
-| `deps` <Tag variant="new">3</Tag>        | A list of strings, of the same length as `words`, to assign as `token.dep` for each word. Defaults to `None`. ~~Optional[List[str]]~~                                                              |
-| `sent_starts` <Tag variant="new">3</Tag> | A list of values, of the same length as `words`, to assign as `token.is_sent_start`. Will be overridden by heads if `heads` is provided. Defaults to `None`. ~~Optional[List[Optional[bool]]]~~    |
-| `ents` <Tag variant="new">3</Tag>        | A list of strings, of the same length of `words`, to assign the token-based IOB tag. Defaults to `None`. ~~Optional[List[str]]~~                                                                   |
+| Name                                     | Description                                                                                                                                                                                             |
+| ---------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `vocab`                                  | A storage container for lexical types. ~~Vocab~~                                                                                                                                                        |
+| `words`                                  | A list of strings or integer hash values to add to the document as words. ~~Optional[List[Union[str,int]]]~~                                                                                            |
+| `spaces`                                 | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. ~~Optional[List[bool]]~~            |
+| _keyword-only_                           |                                                                                                                                                                                                         |
+| `user\_data`                             | Optional extra data to attach to the Doc. ~~Dict~~                                                                                                                                                      |
+| `tags` <Tag variant="new">3</Tag>        | A list of strings, of the same length as `words`, to assign as `token.tag` for each word. Defaults to `None`. ~~Optional[List[str]]~~                                                                   |
+| `pos` <Tag variant="new">3</Tag>         | A list of strings, of the same length as `words`, to assign as `token.pos` for each word. Defaults to `None`. ~~Optional[List[str]]~~                                                                   |
+| `morphs` <Tag variant="new">3</Tag>      | A list of strings, of the same length as `words`, to assign as `token.morph` for each word. Defaults to `None`. ~~Optional[List[str]]~~                                                                 |
+| `lemmas` <Tag variant="new">3</Tag>      | A list of strings, of the same length as `words`, to assign as `token.lemma` for each word. Defaults to `None`. ~~Optional[List[str]]~~                                                                 |
+| `heads` <Tag variant="new">3</Tag>       | A list of values, of the same length as `words`, to assign as the head for each word. Head indices are the absolute position of the head in the `Doc`. Defaults to `None`. ~~Optional[List[int]]~~      |
+| `deps` <Tag variant="new">3</Tag>        | A list of strings, of the same length as `words`, to assign as `token.dep` for each word. Defaults to `None`. ~~Optional[List[str]]~~                                                                   |
+| `sent_starts` <Tag variant="new">3</Tag> | A list of values, of the same length as `words`, to assign as `token.is_sent_start`. Will be overridden by heads if `heads` is provided. Defaults to `None`. ~~Optional[List[Union[bool, int, None]]]~~ |
+| `ents` <Tag variant="new">3</Tag>        | A list of strings, of the same length of `words`, to assign the token-based IOB tag. Defaults to `None`. ~~Optional[List[str]]~~                                                                        |

 ## Doc.\_\_getitem\_\_ {#getitem tag="method"}