mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-26 01:46:28 +03:00
Add sent_starts to Doc.__init__
Add sent_starts to `Doc.__init__`. Officially specify `is_sent_start` values but also convert to and accept `sent_start` internally.
This commit is contained in:
parent
6aa91c7ca0
commit
f212303729
|
@ -9,6 +9,26 @@ from spacy.attrs import ENT_TYPE, ENT_IOB, SENT_START, HEAD, DEP, MORPH
|
||||||
from ..util import get_doc
|
from ..util import get_doc
|
||||||
|
|
||||||
|
|
||||||
|
def test_doc_api_init(en_vocab):
|
||||||
|
# set sent_start by sent_starts
|
||||||
|
doc = Doc(
|
||||||
|
en_vocab, words=["a", "b", "c", "d"], sent_starts=[True, False, True, False]
|
||||||
|
)
|
||||||
|
assert [t.is_sent_start for t in doc] == [True, False, True, False]
|
||||||
|
|
||||||
|
# set sent_start by heads
|
||||||
|
doc = Doc(
|
||||||
|
en_vocab, words=["a", "b", "c", "d"], heads=[0, 0, 2, 2], deps=["dep"] * 4
|
||||||
|
)
|
||||||
|
assert [t.is_sent_start for t in doc] == [True, False, True, False]
|
||||||
|
|
||||||
|
# heads override sent_starts
|
||||||
|
doc = Doc(
|
||||||
|
en_vocab, words=["a", "b", "c", "d"], sent_starts=[True] * 4, heads=[0, 0, 2, 2], deps=["dep"] * 4
|
||||||
|
)
|
||||||
|
assert [t.is_sent_start for t in doc] == [True, False, True, False]
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("text", [["one", "two", "three"]])
|
@pytest.mark.parametrize("text", [["one", "two", "three"]])
|
||||||
def test_doc_api_compare_by_string_position(en_vocab, text):
|
def test_doc_api_compare_by_string_position(en_vocab, text):
|
||||||
doc = Doc(en_vocab, words=text)
|
doc = Doc(en_vocab, words=text)
|
||||||
|
|
|
@ -171,6 +171,7 @@ cdef class Doc:
|
||||||
lemmas=None,
|
lemmas=None,
|
||||||
heads=None,
|
heads=None,
|
||||||
deps=None,
|
deps=None,
|
||||||
|
sent_starts=None,
|
||||||
ents=None,
|
ents=None,
|
||||||
):
|
):
|
||||||
"""Create a Doc object.
|
"""Create a Doc object.
|
||||||
|
@ -183,13 +184,24 @@ cdef class Doc:
|
||||||
words. True means that the word is followed by a space, False means
|
words. True means that the word is followed by a space, False means
|
||||||
it is not. If `None`, defaults to `[True]*len(words)`
|
it is not. If `None`, defaults to `[True]*len(words)`
|
||||||
user_data (dict or None): Optional extra data to attach to the Doc.
|
user_data (dict or None): Optional extra data to attach to the Doc.
|
||||||
tags (Optional[List[str]]): A list of unicode strings, of the same length as words, to assign as token.tag. Defaults to None.
|
tags (Optional[List[str]]): A list of unicode strings, of the same
|
||||||
pos (Optional[List[str]]): A list of unicode strings, of the same length as words, to assign as token.pos. Defaults to None.
|
length as words, to assign as token.tag. Defaults to None.
|
||||||
morphs (Optional[List[str]]): A list of unicode strings, of the same length as words, to assign as token.morph. Defaults to None.
|
pos (Optional[List[str]]): A list of unicode strings, of the same
|
||||||
lemmas (Optional[List[str]]): A list of unicode strings, of the same length as words, to assign as token.lemma. Defaults to None.
|
length as words, to assign as token.pos. Defaults to None.
|
||||||
heads (Optional[List[int]]): A list of values, of the same length as words, to assign as heads. Head indices are the position of the head in the doc. Defaults to None.
|
morphs (Optional[List[str]]): A list of unicode strings, of the same
|
||||||
deps (Optional[List[str]]): A list of unicode strings, of the same length as words, to assign as token.dep. Defaults to None.
|
length as words, to assign as token.morph. Defaults to None.
|
||||||
ents (Optional[List[Span]]): A list of spans to assign as doc.ents. Defaults to None.
|
lemmas (Optional[List[str]]): A list of unicode strings, of the same
|
||||||
|
length as words, to assign as token.lemma. Defaults to None.
|
||||||
|
heads (Optional[List[int]]): A list of values, of the same length as
|
||||||
|
words, to assign as heads. Head indices are the position of the
|
||||||
|
head in the doc. Defaults to None.
|
||||||
|
deps (Optional[List[str]]): A list of unicode strings, of the same
|
||||||
|
length as words, to assign as token.dep. Defaults to None.
|
||||||
|
sent_starts (Optional[List[Union[bool, None]]]): A list of values, of
|
||||||
|
the same length as words, to assign as token.is_sent_start. Will be
|
||||||
|
overridden by heads if heads is provided. Defaults to None.
|
||||||
|
ents (Optional[List[Span]]): A list of spans to assign as doc.ents.
|
||||||
|
Defaults to None.
|
||||||
|
|
||||||
DOCS: https://nightly.spacy.io/api/doc#init
|
DOCS: https://nightly.spacy.io/api/doc#init
|
||||||
"""
|
"""
|
||||||
|
@ -242,16 +254,24 @@ cdef class Doc:
|
||||||
heads = [head - i for i, head in enumerate(heads)]
|
heads = [head - i for i, head in enumerate(heads)]
|
||||||
if deps and not heads:
|
if deps and not heads:
|
||||||
heads = [0] * len(deps)
|
heads = [0] * len(deps)
|
||||||
|
if sent_starts is not None:
|
||||||
|
for i in range(len(sent_starts)):
|
||||||
|
if sent_starts[i] is True:
|
||||||
|
sent_starts[i] = 1
|
||||||
|
elif sent_starts[i] is False:
|
||||||
|
sent_starts[i] = -1
|
||||||
|
elif sent_starts[i] is None or sent_starts[i] not in [-1, 0, 1]:
|
||||||
|
sent_starts[i] = 0
|
||||||
headings = []
|
headings = []
|
||||||
values = []
|
values = []
|
||||||
annotations = [pos, heads, deps, lemmas, tags, morphs]
|
annotations = [pos, heads, deps, lemmas, tags, morphs, sent_starts]
|
||||||
possible_headings = [POS, HEAD, DEP, LEMMA, TAG, MORPH]
|
possible_headings = [POS, HEAD, DEP, LEMMA, TAG, MORPH, SENT_START]
|
||||||
for a, annot in enumerate(annotations):
|
for a, annot in enumerate(annotations):
|
||||||
if annot is not None:
|
if annot is not None:
|
||||||
if len(annot) != len(words):
|
if len(annot) != len(words):
|
||||||
raise ValueError(Errors.E189)
|
raise ValueError(Errors.E189)
|
||||||
headings.append(possible_headings[a])
|
headings.append(possible_headings[a])
|
||||||
if annot is not heads:
|
if annot is not heads and annot is not sent_starts:
|
||||||
values.extend(annot)
|
values.extend(annot)
|
||||||
for value in values:
|
for value in values:
|
||||||
self.vocab.strings.add(value)
|
self.vocab.strings.add(value)
|
||||||
|
@ -263,12 +283,12 @@ cdef class Doc:
|
||||||
j = 0
|
j = 0
|
||||||
for annot in annotations:
|
for annot in annotations:
|
||||||
if annot:
|
if annot:
|
||||||
if annot is heads:
|
if annot is heads or annot is sent_starts:
|
||||||
for i in range(len(words)):
|
for i in range(len(words)):
|
||||||
if attrs.ndim == 1:
|
if attrs.ndim == 1:
|
||||||
attrs[i] = heads[i]
|
attrs[i] = annot[i]
|
||||||
else:
|
else:
|
||||||
attrs[i, j] = heads[i]
|
attrs[i, j] = annot[i]
|
||||||
elif annot is morphs:
|
elif annot is morphs:
|
||||||
for i in range(len(words)):
|
for i in range(len(words)):
|
||||||
morph_key = vocab.morphology.add(morphs[i])
|
morph_key = vocab.morphology.add(morphs[i])
|
||||||
|
|
|
@ -43,6 +43,7 @@ Construct a `Doc` object. The most common way to get a `Doc` object is via the
|
||||||
| lemmas | A list of strings, of the same length as words, to assign as `token.lemma` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
|
| lemmas | A list of strings, of the same length as words, to assign as `token.lemma` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
|
||||||
| heads | A list of values, of the same length as words, to assign as the head for each word. Head indices are the absolute position of the head in the doc. Defaults to `None`. ~~Optional[List[int]]~~ |
|
| heads | A list of values, of the same length as words, to assign as the head for each word. Head indices are the absolute position of the head in the doc. Defaults to `None`. ~~Optional[List[int]]~~ |
|
||||||
| deps | A list of strings, of the same length as words, to assign as `token.dep` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
|
| deps | A list of strings, of the same length as words, to assign as `token.dep` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
|
||||||
|
| sent_starts | A list of values, of the same length as words, to assign as `token.is_sent_start`. Will be overridden by heads if heads is provided. Defaults to `None`. ~~Optional[List[Union[bool, None]]]~~ |
|
||||||
| ents | A list of spans to assign as doc.ents. Defaults to `None`. ~~Optional[List[Span]]~~ |
|
| ents | A list of spans to assign as doc.ents. Defaults to `None`. ~~Optional[List[Span]]~~ |
|
||||||
|
|
||||||
## Doc.\_\_getitem\_\_ {#getitem tag="method"}
|
## Doc.\_\_getitem\_\_ {#getitem tag="method"}
|
||||||
|
|
Loading…
Reference in New Issue
Block a user