mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-10 19:57:17 +03:00
Remove side effects from Doc.__init__() (#11506)
* Remove side effects from Doc.__init__()
* Changes based on review comment
* Readd test
* Change interface of Doc.__init__()
* Simplify test
Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
* Update doc.md
Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
This commit is contained in:
parent
f40d2fac29
commit
6f692a06d5
|
@ -82,6 +82,21 @@ def test_issue2396(en_vocab):
|
||||||
assert (span.get_lca_matrix() == matrix).all()
|
assert (span.get_lca_matrix() == matrix).all()
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.issue(11499)
|
||||||
|
def test_init_args_unmodified(en_vocab):
|
||||||
|
words = ["A", "sentence"]
|
||||||
|
ents = ["B-TYPE1", ""]
|
||||||
|
sent_starts = [True, False]
|
||||||
|
Doc(
|
||||||
|
vocab=en_vocab,
|
||||||
|
words=words,
|
||||||
|
ents=ents,
|
||||||
|
sent_starts=sent_starts,
|
||||||
|
)
|
||||||
|
assert ents == ["B-TYPE1", ""]
|
||||||
|
assert sent_starts == [True, False]
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("text", ["-0.23", "+123,456", "±1"])
|
@pytest.mark.parametrize("text", ["-0.23", "+123,456", "±1"])
|
||||||
@pytest.mark.parametrize("lang_cls", [English, MultiLanguage])
|
@pytest.mark.parametrize("lang_cls", [English, MultiLanguage])
|
||||||
@pytest.mark.issue(2782)
|
@pytest.mark.issue(2782)
|
||||||
|
|
|
@ -72,7 +72,7 @@ class Doc:
|
||||||
lemmas: Optional[List[str]] = ...,
|
lemmas: Optional[List[str]] = ...,
|
||||||
heads: Optional[List[int]] = ...,
|
heads: Optional[List[int]] = ...,
|
||||||
deps: Optional[List[str]] = ...,
|
deps: Optional[List[str]] = ...,
|
||||||
sent_starts: Optional[List[Union[bool, None]]] = ...,
|
sent_starts: Optional[List[Union[bool, int, None]]] = ...,
|
||||||
ents: Optional[List[str]] = ...,
|
ents: Optional[List[str]] = ...,
|
||||||
) -> None: ...
|
) -> None: ...
|
||||||
@property
|
@property
|
||||||
|
|
|
@ -217,9 +217,9 @@ cdef class Doc:
|
||||||
head in the doc. Defaults to None.
|
head in the doc. Defaults to None.
|
||||||
deps (Optional[List[str]]): A list of unicode strings, of the same
|
deps (Optional[List[str]]): A list of unicode strings, of the same
|
||||||
length as words, to assign as token.dep. Defaults to None.
|
length as words, to assign as token.dep. Defaults to None.
|
||||||
sent_starts (Optional[List[Union[bool, None]]]): A list of values, of
|
sent_starts (Optional[List[Union[bool, int, None]]]): A list of values,
|
||||||
the same length as words, to assign as token.is_sent_start. Will be
|
of the same length as words, to assign as token.is_sent_start. Will
|
||||||
overridden by heads if heads is provided. Defaults to None.
|
be overridden by heads if heads is provided. Defaults to None.
|
||||||
ents (Optional[List[str]]): A list of unicode strings, of the same
|
ents (Optional[List[str]]): A list of unicode strings, of the same
|
||||||
length as words, as IOB tags to assign as token.ent_iob and
|
length as words, as IOB tags to assign as token.ent_iob and
|
||||||
token.ent_type. Defaults to None.
|
token.ent_type. Defaults to None.
|
||||||
|
@ -285,6 +285,7 @@ cdef class Doc:
|
||||||
heads = [0] * len(deps)
|
heads = [0] * len(deps)
|
||||||
if heads and not deps:
|
if heads and not deps:
|
||||||
raise ValueError(Errors.E1017)
|
raise ValueError(Errors.E1017)
|
||||||
|
sent_starts = list(sent_starts) if sent_starts is not None else None
|
||||||
if sent_starts is not None:
|
if sent_starts is not None:
|
||||||
for i in range(len(sent_starts)):
|
for i in range(len(sent_starts)):
|
||||||
if sent_starts[i] is True:
|
if sent_starts[i] is True:
|
||||||
|
@ -300,12 +301,11 @@ cdef class Doc:
|
||||||
ent_iobs = None
|
ent_iobs = None
|
||||||
ent_types = None
|
ent_types = None
|
||||||
if ents is not None:
|
if ents is not None:
|
||||||
|
ents = [ent if ent != "" else None for ent in ents]
|
||||||
iob_strings = Token.iob_strings()
|
iob_strings = Token.iob_strings()
|
||||||
# make valid IOB2 out of IOB1 or IOB2
|
# make valid IOB2 out of IOB1 or IOB2
|
||||||
for i, ent in enumerate(ents):
|
for i, ent in enumerate(ents):
|
||||||
if ent is "":
|
if ent is not None and not isinstance(ent, str):
|
||||||
ents[i] = None
|
|
||||||
elif ent is not None and not isinstance(ent, str):
|
|
||||||
raise ValueError(Errors.E177.format(tag=ent))
|
raise ValueError(Errors.E177.format(tag=ent))
|
||||||
if i < len(ents) - 1:
|
if i < len(ents) - 1:
|
||||||
# OI -> OB
|
# OI -> OB
|
||||||
|
|
|
@ -31,21 +31,21 @@ Construct a `Doc` object. The most common way to get a `Doc` object is via the
|
||||||
> doc = Doc(nlp.vocab, words=words, spaces=spaces)
|
> doc = Doc(nlp.vocab, words=words, spaces=spaces)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ---------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| ---------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `vocab` | A storage container for lexical types. ~~Vocab~~ |
|
| `vocab` | A storage container for lexical types. ~~Vocab~~ |
|
||||||
| `words` | A list of strings or integer hash values to add to the document as words. ~~Optional[List[Union[str,int]]]~~ |
|
| `words` | A list of strings or integer hash values to add to the document as words. ~~Optional[List[Union[str,int]]]~~ |
|
||||||
| `spaces` | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. ~~Optional[List[bool]]~~ |
|
| `spaces` | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. ~~Optional[List[bool]]~~ |
|
||||||
| _keyword-only_ | |
|
| _keyword-only_ | |
|
||||||
| `user\_data` | Optional extra data to attach to the Doc. ~~Dict~~ |
|
| `user\_data` | Optional extra data to attach to the Doc. ~~Dict~~ |
|
||||||
| `tags` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.tag` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
|
| `tags` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.tag` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
|
||||||
| `pos` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.pos` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
|
| `pos` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.pos` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
|
||||||
| `morphs` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.morph` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
|
| `morphs` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.morph` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
|
||||||
| `lemmas` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.lemma` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
|
| `lemmas` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.lemma` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
|
||||||
| `heads` <Tag variant="new">3</Tag> | A list of values, of the same length as `words`, to assign as the head for each word. Head indices are the absolute position of the head in the `Doc`. Defaults to `None`. ~~Optional[List[int]]~~ |
|
| `heads` <Tag variant="new">3</Tag> | A list of values, of the same length as `words`, to assign as the head for each word. Head indices are the absolute position of the head in the `Doc`. Defaults to `None`. ~~Optional[List[int]]~~ |
|
||||||
| `deps` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.dep` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
|
| `deps` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.dep` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
|
||||||
| `sent_starts` <Tag variant="new">3</Tag> | A list of values, of the same length as `words`, to assign as `token.is_sent_start`. Will be overridden by heads if `heads` is provided. Defaults to `None`. ~~Optional[List[Optional[bool]]]~~ |
|
| `sent_starts` <Tag variant="new">3</Tag> | A list of values, of the same length as `words`, to assign as `token.is_sent_start`. Will be overridden by heads if `heads` is provided. Defaults to `None`. ~~Optional[List[Union[bool, int, None]]]~~ |
|
||||||
| `ents` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign the token-based IOB tag. Defaults to `None`. ~~Optional[List[str]]~~ |
|
| `ents` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign the token-based IOB tag. Defaults to `None`. ~~Optional[List[str]]~~ |
|
||||||
|
|
||||||
## Doc.\_\_getitem\_\_ {#getitem tag="method"}
|
## Doc.\_\_getitem\_\_ {#getitem tag="method"}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user