Merge pull request #6098 from adrianeboyd/feature/doc-init

Commit a5f6ab4943

@@ -57,7 +57,10 @@ class Warnings:
             "incorrect. Modify PhraseMatcher._terminal_hash to fix.")
     W024 = ("Entity '{entity}' - Alias '{alias}' combination already exists in "
             "the Knowledge Base.")
-    W026 = ("Unable to set all sentence boundaries from dependency parses.")
+    W026 = ("Unable to set all sentence boundaries from dependency parses. If "
+            "you are constructing a parse tree incrementally by setting "
+            "token.head values, you can probably ignore this warning. Consider "
+            "using Doc(words, ..., heads=heads, deps=deps) instead.")
     W027 = ("Found a large training file of {size} bytes. Note that it may "
             "be more efficient to split your training data into multiple "
             "smaller JSON files instead.")
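
The constructor usage that the new W026 text recommends looks like this in
practice (a minimal sketch with a bare Vocab and illustrative values, not
taken from the diff):

    from spacy.vocab import Vocab
    from spacy.tokens import Doc

    words = ["She", "likes", "cats"]
    # Heads are absolute token indices; a token that is its own head is a root.
    doc = Doc(Vocab(), words=words, heads=[1, 1, 1], deps=["nsubj", "ROOT", "dobj"])
    assert doc[0].head.text == "likes"
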
@@ -9,6 +9,26 @@ from spacy.attrs import ENT_TYPE, ENT_IOB, SENT_START, HEAD, DEP, MORPH
 from ..util import get_doc


+def test_doc_api_init(en_vocab):
+    # set sent_start by sent_starts
+    doc = Doc(
+        en_vocab, words=["a", "b", "c", "d"], sent_starts=[True, False, True, False]
+    )
+    assert [t.is_sent_start for t in doc] == [True, False, True, False]
+
+    # set sent_start by heads
+    doc = Doc(
+        en_vocab, words=["a", "b", "c", "d"], heads=[0, 0, 2, 2], deps=["dep"] * 4
+    )
+    assert [t.is_sent_start for t in doc] == [True, False, True, False]
+
+    # heads override sent_starts
+    doc = Doc(
+        en_vocab, words=["a", "b", "c", "d"], sent_starts=[True] * 4, heads=[0, 0, 2, 2], deps=["dep"] * 4
+    )
+    assert [t.is_sent_start for t in doc] == [True, False, True, False]
+
+
 @pytest.mark.parametrize("text", [["one", "two", "three"]])
 def test_doc_api_compare_by_string_position(en_vocab, text):
     doc = Doc(en_vocab, words=text)
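
In the second and third test cases the boundaries follow from the tree
structure: heads are absolute indices, so tokens 0 and 2 attach to themselves,
become roots of two separate trees, and each root opens a new sentence; this
is also why heads take precedence over the explicit sent_starts values. The
same effect in a two-token sketch (using the en_vocab fixture like the tests):

    doc = Doc(en_vocab, words=["a", "b"], heads=[0, 1], deps=["dep", "dep"])
    # two self-attached roots -> two one-token sentences
    assert [t.is_sent_start for t in doc] == [True, True]
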
@@ -30,60 +30,21 @@ def get_doc(
     morphs=None,
 ):
     """Create Doc object from given vocab, words and annotations."""
-    if deps and not heads:
-        heads = [0] * len(deps)
-    headings = []
-    values = []
-    annotations = [pos, heads, deps, lemmas, tags, morphs]
-    possible_headings = [POS, HEAD, DEP, LEMMA, TAG, MORPH]
-    for a, annot in enumerate(annotations):
-        if annot is not None:
-            if len(annot) != len(words):
-                raise ValueError(Errors.E189)
-            headings.append(possible_headings[a])
-            if annot is not heads:
-                values.extend(annot)
-    for value in values:
-        vocab.strings.add(value)
-    doc = Doc(vocab, words=words)
-    # if there are any other annotations, set them
-    if headings:
-        attrs = doc.to_array(headings)
-        j = 0
-        for annot in annotations:
-            if annot:
-                if annot is heads:
-                    for i in range(len(words)):
-                        if attrs.ndim == 1:
-                            attrs[i] = heads[i]
-                        else:
-                            attrs[i, j] = heads[i]
-                elif annot is morphs:
-                    for i in range(len(words)):
-                        morph_key = vocab.morphology.add(morphs[i])
-                        if attrs.ndim == 1:
-                            attrs[i] = morph_key
-                        else:
-                            attrs[i, j] = morph_key
-                else:
-                    for i in range(len(words)):
-                        if attrs.ndim == 1:
-                            attrs[i] = doc.vocab.strings[annot[i]]
-                        else:
-                            attrs[i, j] = doc.vocab.strings[annot[i]]
-                j += 1
-        doc.from_array(headings, attrs)
-    # finally, set the entities
-    if ents:
-        doc.ents = [
-            Span(doc, start, end, label=doc.vocab.strings[label])
-            for start, end, label in ents
-        ]
-    return doc
+    if heads is not None:
+        heads = [i + head for i, head in enumerate(heads)]
+    if ents is not None:
+        ents = [(vocab.strings[ent_type], start, end) for start, end, ent_type in ents]
+    return Doc(
+        vocab,
+        words=words,
+        pos=pos,
+        heads=heads,
+        deps=deps,
+        tags=tags,
+        ents=ents,
+        lemmas=lemmas,
+        morphs=morphs,
+    )


 def get_batch(batch_size):
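
The one behavioral subtlety in the slimmed-down helper is the head conversion:
get_doc takes heads as offsets relative to each token, while the new
Doc.__init__ expects absolute indices, hence the
`heads = [i + head for i, head in enumerate(heads)]` line. For example
(hypothetical offsets, not from the diff):

    rel_heads = [1, 0, -1]  # "a" attaches one to the right, "b" to itself, "c" one to the left
    abs_heads = [i + h for i, h in enumerate(rel_heads)]
    assert abs_heads == [1, 1, 1]  # all three tokens attach to index 1
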
@@ -158,17 +158,50 @@ cdef class Doc:
             raise ValueError(Errors.E046.format(name=name))
         return Underscore.doc_extensions.pop(name)

-    def __init__(self, Vocab vocab, words=None, spaces=None, user_data=None):
+    def __init__(
+        self,
+        Vocab vocab,
+        words=None,
+        spaces=None,
+        *,
+        user_data=None,
+        tags=None,
+        pos=None,
+        morphs=None,
+        lemmas=None,
+        heads=None,
+        deps=None,
+        sent_starts=None,
+        ents=None,
+    ):
         """Create a Doc object.

         vocab (Vocab): A vocabulary object, which must match any models you
             want to use (e.g. tokenizer, parser, entity recognizer).
-        words (list or None): A list of unicode strings to add to the document
+        words (Optional[List[str]]): A list of unicode strings to add to the document
             as words. If `None`, defaults to empty list.
-        spaces (list or None): A list of boolean values, of the same length as
+        spaces (Optional[List[bool]]): A list of boolean values, of the same length as
             words. True means that the word is followed by a space, False means
             it is not. If `None`, defaults to `[True]*len(words)`
         user_data (dict or None): Optional extra data to attach to the Doc.
+        tags (Optional[List[str]]): A list of unicode strings, of the same
+            length as words, to assign as token.tag. Defaults to None.
+        pos (Optional[List[str]]): A list of unicode strings, of the same
+            length as words, to assign as token.pos. Defaults to None.
+        morphs (Optional[List[str]]): A list of unicode strings, of the same
+            length as words, to assign as token.morph. Defaults to None.
+        lemmas (Optional[List[str]]): A list of unicode strings, of the same
+            length as words, to assign as token.lemma. Defaults to None.
+        heads (Optional[List[int]]): A list of values, of the same length as
+            words, to assign as heads. Head indices are the position of the
+            head in the doc. Defaults to None.
+        deps (Optional[List[str]]): A list of unicode strings, of the same
+            length as words, to assign as token.dep. Defaults to None.
+        sent_starts (Optional[List[Union[bool, None]]]): A list of values, of
+            the same length as words, to assign as token.is_sent_start. Will be
+            overridden by heads if heads is provided. Defaults to None.
+        ents (Optional[List[Span]]): A list of spans to assign as doc.ents.
+            Defaults to None.

         DOCS: https://nightly.spacy.io/api/doc#init
         """
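
Everything after the bare `*` in the new signature is keyword-only, so the
annotation lists can never be passed positionally. A fuller construction might
look like this (a sketch; the tag, POS, and dependency values are illustrative):

    from spacy.vocab import Vocab
    from spacy.tokens import Doc

    doc = Doc(
        Vocab(),
        words=["Berlin", "is", "nice"],
        spaces=[True, True, False],
        tags=["NNP", "VBZ", "JJ"],    # fine-grained tags -> token.tag_
        pos=["PROPN", "AUX", "ADJ"],  # coarse universal tags -> token.pos_
        heads=[1, 1, 1],
        deps=["nsubj", "ROOT", "acomp"],
    )
    assert doc[0].tag_ == "NNP" and doc[0].head.text == "is"
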
@@ -217,6 +250,63 @@ cdef class Doc:
             lexeme = self.vocab.get_by_orth(self.mem, word)
             self.push_back(lexeme, has_space)

+        if heads is not None:
+            heads = [head - i for i, head in enumerate(heads)]
+        if deps and not heads:
+            heads = [0] * len(deps)
+        if sent_starts is not None:
+            for i in range(len(sent_starts)):
+                if sent_starts[i] is True:
+                    sent_starts[i] = 1
+                elif sent_starts[i] is False:
+                    sent_starts[i] = -1
+                elif sent_starts[i] is None or sent_starts[i] not in [-1, 0, 1]:
+                    sent_starts[i] = 0
+        headings = []
+        values = []
+        annotations = [pos, heads, deps, lemmas, tags, morphs, sent_starts]
+        possible_headings = [POS, HEAD, DEP, LEMMA, TAG, MORPH, SENT_START]
+        for a, annot in enumerate(annotations):
+            if annot is not None:
+                if len(annot) != len(words):
+                    raise ValueError(Errors.E189)
+                headings.append(possible_headings[a])
+                if annot is not heads and annot is not sent_starts:
+                    values.extend(annot)
+        for value in values:
+            self.vocab.strings.add(value)
+
+        # if there are any other annotations, set them
+        if headings:
+            attrs = self.to_array(headings)
+            j = 0
+            for annot in annotations:
+                if annot:
+                    if annot is heads or annot is sent_starts:
+                        for i in range(len(words)):
+                            if attrs.ndim == 1:
+                                attrs[i] = annot[i]
+                            else:
+                                attrs[i, j] = annot[i]
+                    elif annot is morphs:
+                        for i in range(len(words)):
+                            morph_key = vocab.morphology.add(morphs[i])
+                            if attrs.ndim == 1:
+                                attrs[i] = morph_key
+                            else:
+                                attrs[i, j] = morph_key
+                    else:
+                        for i in range(len(words)):
+                            if attrs.ndim == 1:
+                                attrs[i] = self.vocab.strings[annot[i]]
+                            else:
+                                attrs[i, j] = self.vocab.strings[annot[i]]
+                    j += 1
+            self.from_array(headings, attrs)
+        if ents is not None:
+            self.ents = ents
+
     @property
     def _(self):
         """Custom extension attributes registered via `set_extension`."""
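
The boolean convenience values accepted for sent_starts are normalized to the
internal SENT_START encoding: 1 forces a boundary, -1 forbids one, and 0 leaves
it unspecified. Heads go the other way, from the absolute indices of the
public API to the relative offsets stored internally (`head - i`). The round
trip through Token.is_sent_start restores the booleans (a sketch, assuming the
imports from the earlier examples):

    doc = Doc(Vocab(), words=["a", "b", "c"], sent_starts=[True, None, False])
    # stored as 1 / 0 / -1, read back as True / None / False
    assert [t.is_sent_start for t in doc] == [True, None, False]
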
@@ -199,13 +199,17 @@ def doc_from_conllu_sentence(
         heads.append(head)
         deps.append(dep)

-    doc = Doc(vocab, words=words, spaces=spaces)
+    doc = Doc(
+        vocab,
+        words=words,
+        spaces=spaces,
+        tags=tags,
+        pos=poses,
+        deps=deps,
+        lemmas=lemmas,
+        heads=heads,
+    )
     for i in range(len(doc)):
-        doc[i].tag_ = tags[i]
-        doc[i].pos_ = poses[i]
-        doc[i].dep_ = deps[i]
-        doc[i].lemma_ = lemmas[i]
-        doc[i].head = doc[heads[i]]
         doc[i]._.merged_orth = words[i]
         doc[i]._.merged_morph = morphs[i]
         doc[i]._.merged_lemma = lemmas[i]

@@ -232,14 +236,17 @@ def doc_from_conllu_sentence(
         heads.append(t.head.i)
         deps.append(t.dep_)

-    doc_x = Doc(vocab, words=words, spaces=spaces)
-    for i in range(len(doc)):
-        doc_x[i].tag_ = tags[i]
-        doc_x[i].morph_ = morphs[i]
-        doc_x[i].lemma_ = lemmas[i]
-        doc_x[i].pos_ = poses[i]
-        doc_x[i].dep_ = deps[i]
-        doc_x[i].head = doc_x[heads[i]]
+    doc_x = Doc(
+        vocab,
+        words=words,
+        spaces=spaces,
+        tags=tags,
+        morphs=morphs,
+        lemmas=lemmas,
+        pos=poses,
+        deps=deps,
+        heads=heads,
+    )
     doc_x.ents = [Span(doc_x, ent.start, ent.end, label=ent.label) for ent in doc.ents]

     return doc_x
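
Both converter call sites now hand the token-level annotation to the
constructor, which writes it in a single from_array pass rather than one
Python-level attribute assignment per token; note that the collected heads
(`t.head.i`) are already absolute indices, matching the new API. Conceptually
the two patterns are equivalent (a toy sketch, not taken from the converter):

    from spacy.vocab import Vocab
    from spacy.tokens import Doc

    words, tags = ["a", "b"], ["X", "Y"]
    old_doc = Doc(Vocab(), words=words)
    for i in range(len(old_doc)):  # old pattern: per-token setattr
        old_doc[i].tag_ = tags[i]
    new_doc = Doc(Vocab(), words=words, tags=tags)  # new pattern: batched in __init__
    assert [t.tag_ for t in old_doc] == [t.tag_ for t in new_doc]
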
@@ -31,10 +31,20 @@ Construct a `Doc` object. The most common way to get a `Doc` object is via the
 > ```

 | Name           | Description |
-| -------- | ----------- |
+| -------------- | ----------- |
 | `vocab`        | A storage container for lexical types. ~~Vocab~~ |
 | `words`        | A list of strings to add to the container. ~~Optional[List[str]]~~ |
 | `spaces`       | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. ~~Optional[List[bool]]~~ |
+| _keyword-only_ | |
+| `user_data`    | Optional extra data to attach to the Doc. ~~Dict~~ |
+| `tags`         | A list of strings, of the same length as `words`, to assign as `token.tag` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
+| `pos`          | A list of strings, of the same length as `words`, to assign as `token.pos` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
+| `morphs`       | A list of strings, of the same length as `words`, to assign as `token.morph` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
+| `lemmas`       | A list of strings, of the same length as `words`, to assign as `token.lemma` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
+| `heads`        | A list of values, of the same length as `words`, to assign as the head for each word. Head indices are the absolute position of the head in the doc. Defaults to `None`. ~~Optional[List[int]]~~ |
+| `deps`         | A list of strings, of the same length as `words`, to assign as `token.dep` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
+| `sent_starts`  | A list of values, of the same length as `words`, to assign as `token.is_sent_start`. Will be overridden by `heads` if `heads` is provided. Defaults to `None`. ~~Optional[List[Union[bool, None]]]~~ |
+| `ents`         | A list of spans to assign as `doc.ents`. Defaults to `None`. ~~Optional[List[Span]]~~ |

 ## Doc.\_\_getitem\_\_ {#getitem tag="method"}
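
As the table documents, `sent_starts` makes it possible to define sentence
boundaries without a parse. A short sketch (assuming boundaries are specified
for every token, which lets `doc.sents` iterate):

    from spacy.vocab import Vocab
    from spacy.tokens import Doc

    doc = Doc(
        Vocab(),
        words=["Hello", "world", "Bye", "world"],
        sent_starts=[True, False, True, False],
    )
    assert [s.text for s in doc.sents] == ["Hello world", "Bye world"]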