mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-25 21:21:10 +03:00 
			
		
		
		
	Remove side effects from Doc.__init__() (#11506)
* Remove side effects from Doc.__init__()
* Changes based on review comment
* Readd test
* Change interface of Doc.__init__()
* Simplify test

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>

* Update doc.md

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
This commit is contained in:
		
							parent
							
								
									f40d2fac29
								
							
						
					
					
						commit
						6f692a06d5
					
				|  | @ -82,6 +82,21 @@ def test_issue2396(en_vocab): | ||||||
|     assert (span.get_lca_matrix() == matrix).all() |     assert (span.get_lca_matrix() == matrix).all() | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | @pytest.mark.issue(11499) | ||||||
|  | def test_init_args_unmodified(en_vocab): | ||||||
|  |     words = ["A", "sentence"] | ||||||
|  |     ents = ["B-TYPE1", ""] | ||||||
|  |     sent_starts = [True, False] | ||||||
|  |     Doc( | ||||||
|  |         vocab=en_vocab, | ||||||
|  |         words=words, | ||||||
|  |         ents=ents, | ||||||
|  |         sent_starts=sent_starts, | ||||||
|  |     ) | ||||||
|  |     assert ents == ["B-TYPE1", ""] | ||||||
|  |     assert sent_starts == [True, False] | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
| @pytest.mark.parametrize("text", ["-0.23", "+123,456", "±1"]) | @pytest.mark.parametrize("text", ["-0.23", "+123,456", "±1"]) | ||||||
| @pytest.mark.parametrize("lang_cls", [English, MultiLanguage]) | @pytest.mark.parametrize("lang_cls", [English, MultiLanguage]) | ||||||
| @pytest.mark.issue(2782) | @pytest.mark.issue(2782) | ||||||
|  |  | ||||||
|  | @ -72,7 +72,7 @@ class Doc: | ||||||
|         lemmas: Optional[List[str]] = ..., |         lemmas: Optional[List[str]] = ..., | ||||||
|         heads: Optional[List[int]] = ..., |         heads: Optional[List[int]] = ..., | ||||||
|         deps: Optional[List[str]] = ..., |         deps: Optional[List[str]] = ..., | ||||||
|         sent_starts: Optional[List[Union[bool, None]]] = ..., |         sent_starts: Optional[List[Union[bool, int, None]]] = ..., | ||||||
|         ents: Optional[List[str]] = ..., |         ents: Optional[List[str]] = ..., | ||||||
|     ) -> None: ... |     ) -> None: ... | ||||||
|     @property |     @property | ||||||
|  |  | ||||||
|  | @ -217,9 +217,9 @@ cdef class Doc: | ||||||
|             head in the doc. Defaults to None. |             head in the doc. Defaults to None. | ||||||
|         deps (Optional[List[str]]): A list of unicode strings, of the same |         deps (Optional[List[str]]): A list of unicode strings, of the same | ||||||
|             length as words, to assign as token.dep. Defaults to None. |             length as words, to assign as token.dep. Defaults to None. | ||||||
|         sent_starts (Optional[List[Union[bool, None]]]): A list of values, of |         sent_starts (Optional[List[Union[bool, int, None]]]): A list of values,  | ||||||
|             the same length as words, to assign as token.is_sent_start. Will be |             of the same length as words, to assign as token.is_sent_start. Will  | ||||||
|             overridden by heads if heads is provided. Defaults to None. |             be overridden by heads if heads is provided. Defaults to None. | ||||||
|         ents (Optional[List[str]]): A list of unicode strings, of the same |         ents (Optional[List[str]]): A list of unicode strings, of the same | ||||||
|             length as words, as IOB tags to assign as token.ent_iob and |             length as words, as IOB tags to assign as token.ent_iob and | ||||||
|             token.ent_type. Defaults to None. |             token.ent_type. Defaults to None. | ||||||
|  | @ -285,6 +285,7 @@ cdef class Doc: | ||||||
|             heads = [0] * len(deps) |             heads = [0] * len(deps) | ||||||
|         if heads and not deps: |         if heads and not deps: | ||||||
|             raise ValueError(Errors.E1017) |             raise ValueError(Errors.E1017) | ||||||
|  |         sent_starts = list(sent_starts) if sent_starts is not None else None | ||||||
|         if sent_starts is not None: |         if sent_starts is not None: | ||||||
|             for i in range(len(sent_starts)): |             for i in range(len(sent_starts)): | ||||||
|                 if sent_starts[i] is True: |                 if sent_starts[i] is True: | ||||||
|  | @ -300,12 +301,11 @@ cdef class Doc: | ||||||
|         ent_iobs = None |         ent_iobs = None | ||||||
|         ent_types = None |         ent_types = None | ||||||
|         if ents is not None: |         if ents is not None: | ||||||
|  |             ents = [ent if ent != "" else None for ent in ents] | ||||||
|             iob_strings = Token.iob_strings() |             iob_strings = Token.iob_strings() | ||||||
|             # make valid IOB2 out of IOB1 or IOB2 |             # make valid IOB2 out of IOB1 or IOB2 | ||||||
|             for i, ent in enumerate(ents): |             for i, ent in enumerate(ents): | ||||||
|                 if ent is "": |                 if ent is not None and not isinstance(ent, str): | ||||||
|                     ents[i] = None |  | ||||||
|                 elif ent is not None and not isinstance(ent, str): |  | ||||||
|                     raise ValueError(Errors.E177.format(tag=ent)) |                     raise ValueError(Errors.E177.format(tag=ent)) | ||||||
|                 if i < len(ents) - 1: |                 if i < len(ents) - 1: | ||||||
|                     # OI -> OB |                     # OI -> OB | ||||||
|  |  | ||||||
|  | @ -32,7 +32,7 @@ Construct a `Doc` object. The most common way to get a `Doc` object is via the | ||||||
| > ``` | > ``` | ||||||
| 
 | 
 | ||||||
| | Name                                     | Description                                                                                                                                                                                             | | | Name                                     | Description                                                                                                                                                                                             | | ||||||
| | ---------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | | ---------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ||||||
| | `vocab`                                  | A storage container for lexical types. ~~Vocab~~                                                                                                                                                        | | | `vocab`                                  | A storage container for lexical types. ~~Vocab~~                                                                                                                                                        | | ||||||
| | `words`                                  | A list of strings or integer hash values to add to the document as words. ~~Optional[List[Union[str,int]]]~~                                                                                            | | | `words`                                  | A list of strings or integer hash values to add to the document as words. ~~Optional[List[Union[str,int]]]~~                                                                                            | | ||||||
| | `spaces`                                 | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. ~~Optional[List[bool]]~~            | | | `spaces`                                 | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. ~~Optional[List[bool]]~~            | | ||||||
|  | @ -44,7 +44,7 @@ Construct a `Doc` object. The most common way to get a `Doc` object is via the | ||||||
| | `lemmas` <Tag variant="new">3</Tag>      | A list of strings, of the same length as `words`, to assign as `token.lemma` for each word. Defaults to `None`. ~~Optional[List[str]]~~                                                                 | | | `lemmas` <Tag variant="new">3</Tag>      | A list of strings, of the same length as `words`, to assign as `token.lemma` for each word. Defaults to `None`. ~~Optional[List[str]]~~                                                                 | | ||||||
| | `heads` <Tag variant="new">3</Tag>       | A list of values, of the same length as `words`, to assign as the head for each word. Head indices are the absolute position of the head in the `Doc`. Defaults to `None`. ~~Optional[List[int]]~~      | | | `heads` <Tag variant="new">3</Tag>       | A list of values, of the same length as `words`, to assign as the head for each word. Head indices are the absolute position of the head in the `Doc`. Defaults to `None`. ~~Optional[List[int]]~~      | | ||||||
| | `deps` <Tag variant="new">3</Tag>        | A list of strings, of the same length as `words`, to assign as `token.dep` for each word. Defaults to `None`. ~~Optional[List[str]]~~                                                                   | | | `deps` <Tag variant="new">3</Tag>        | A list of strings, of the same length as `words`, to assign as `token.dep` for each word. Defaults to `None`. ~~Optional[List[str]]~~                                                                   | | ||||||
| | `sent_starts` <Tag variant="new">3</Tag> | A list of values, of the same length as `words`, to assign as `token.is_sent_start`. Will be overridden by heads if `heads` is provided. Defaults to `None`. ~~Optional[List[Optional[bool]]]~~    | | | `sent_starts` <Tag variant="new">3</Tag> | A list of values, of the same length as `words`, to assign as `token.is_sent_start`. Will be overridden by heads if `heads` is provided. Defaults to `None`. ~~Optional[List[Union[bool, int, None]]]~~ | | ||||||
| | `ents` <Tag variant="new">3</Tag>        | A list of strings, of the same length as `words`, to assign the token-based IOB tag. Defaults to `None`. ~~Optional[List[str]]~~                                                                        | | | `ents` <Tag variant="new">3</Tag>        | A list of strings, of the same length as `words`, to assign the token-based IOB tag. Defaults to `None`. ~~Optional[List[str]]~~                                                                        | | ||||||
| 
 | 
 | ||||||
| ## Doc.\_\_getitem\_\_ {#getitem tag="method"} | ## Doc.\_\_getitem\_\_ {#getitem tag="method"} | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue
	
	Block a user