Extend Doc.__init__ with additional annotation

Mostly copying from `spacy.tests.util.get_doc`, add additional kwargs to
`Doc.__init__` to initialize the most common doc/token values.
This commit is contained in:
Adriane Boyd 2020-09-21 13:01:26 +02:00
parent b9d2b29684
commit bc02e86494
5 changed files with 118 additions and 78 deletions

View File

@@ -57,7 +57,10 @@ class Warnings:
"incorrect. Modify PhraseMatcher._terminal_hash to fix.")
W024 = ("Entity '{entity}' - Alias '{alias}' combination already exists in "
"the Knowledge Base.")
W026 = ("Unable to set all sentence boundaries from dependency parses.")
W026 = ("Unable to set all sentence boundaries from dependency parses. If "
"you are constructing a parse tree incrementally by setting "
"token.head values, you can probably ignore this warning. Consider "
"using Doc(words, ..., heads=heads, deps=deps) instead.")
W027 = ("Found a large training file of {size} bytes. Note that it may "
"be more efficient to split your training data into multiple "
"smaller JSON files instead.")

View File

@@ -30,60 +30,12 @@ def get_doc(
morphs=None,
):
"""Create Doc object from given vocab, words and annotations."""
if deps and not heads:
heads = [0] * len(deps)
headings = []
values = []
annotations = [pos, heads, deps, lemmas, tags, morphs]
possible_headings = [POS, HEAD, DEP, LEMMA, TAG, MORPH]
for a, annot in enumerate(annotations):
if annot is not None:
if len(annot) != len(words):
raise ValueError(Errors.E189)
headings.append(possible_headings[a])
if annot is not heads:
values.extend(annot)
for value in values:
vocab.strings.add(value)
doc = Doc(vocab, words=words)
# if there are any other annotations, set them
if headings:
attrs = doc.to_array(headings)
j = 0
for annot in annotations:
if annot:
if annot is heads:
for i in range(len(words)):
if attrs.ndim == 1:
attrs[i] = heads[i]
else:
attrs[i, j] = heads[i]
elif annot is morphs:
for i in range(len(words)):
morph_key = vocab.morphology.add(morphs[i])
if attrs.ndim == 1:
attrs[i] = morph_key
else:
attrs[i, j] = morph_key
else:
for i in range(len(words)):
if attrs.ndim == 1:
attrs[i] = doc.vocab.strings[annot[i]]
else:
attrs[i, j] = doc.vocab.strings[annot[i]]
j += 1
doc.from_array(headings, attrs)
# finally, set the entities
if ents:
doc.ents = [
Span(doc, start, end, label=doc.vocab.strings[label])
for start, end, label in ents
]
return doc
if heads is not None:
heads = [i + head for i, head in enumerate(heads)]
if ents is not None:
ents = [(vocab.strings[ent_type], start, end) for start, end, ent_type in ents]
return Doc(vocab, words=words, pos=pos, heads=heads, deps=deps, tags=tags,
ents=ents, lemmas=lemmas, morphs=morphs)
def get_batch(batch_size):

View File

@@ -158,17 +158,38 @@ cdef class Doc:
raise ValueError(Errors.E046.format(name=name))
return Underscore.doc_extensions.pop(name)
def __init__(self, Vocab vocab, words=None, spaces=None, user_data=None):
def __init__(
self,
Vocab vocab,
words=None,
spaces=None,
user_data=None,
*,
tags=None,
pos=None,
morphs=None,
lemmas=None,
heads=None,
deps=None,
ents=None,
):
"""Create a Doc object.
vocab (Vocab): A vocabulary object, which must match any models you
want to use (e.g. tokenizer, parser, entity recognizer).
words (list or None): A list of unicode strings to add to the document
words (Optional[List[str]]): A list of unicode strings to add to the document
as words. If `None`, defaults to empty list.
spaces (list or None): A list of boolean values, of the same length as
spaces (Optional[List[bool]]): A list of boolean values, of the same length as
words. True means that the word is followed by a space, False means
it is not. If `None`, defaults to `[True]*len(words)`
user_data (dict or None): Optional extra data to attach to the Doc.
tags (Optional[List[str]]): A list of unicode strings, of the same length as words, to assign as token.tag. Defaults to None.
pos (Optional[List[str]]): A list of unicode strings, of the same length as words, to assign as token.pos. Defaults to None.
morphs (Optional[List[str]]): A list of unicode strings, of the same length as words, to assign as token.morph. Defaults to None.
lemmas (Optional[List[str]]): A list of unicode strings, of the same length as words, to assign as token.lemma. Defaults to None.
heads (Optional[List[int]]): A list of values, of the same length as words, to assign as heads. Head indices are the position of the head in the doc. Defaults to None.
deps (Optional[List[str]]): A list of unicode strings, of the same length as words, to assign as token.dep. Defaults to None.
ents (Optional[List[Span]]): A list of spans to assign as doc.ents. Defaults to None.
DOCS: https://nightly.spacy.io/api/doc#init
"""
@@ -217,6 +238,55 @@ cdef class Doc:
lexeme = self.vocab.get_by_orth(self.mem, word)
self.push_back(lexeme, has_space)
if heads is not None:
heads = [head - i for i, head in enumerate(heads)]
if deps and not heads:
heads = [0] * len(deps)
headings = []
values = []
annotations = [pos, heads, deps, lemmas, tags, morphs]
possible_headings = [POS, HEAD, DEP, LEMMA, TAG, MORPH]
for a, annot in enumerate(annotations):
if annot is not None:
if len(annot) != len(words):
raise ValueError(Errors.E189)
headings.append(possible_headings[a])
if annot is not heads:
values.extend(annot)
for value in values:
self.vocab.strings.add(value)
# if there are any other annotations, set them
if headings:
attrs = self.to_array(headings)
j = 0
for annot in annotations:
if annot:
if annot is heads:
for i in range(len(words)):
if attrs.ndim == 1:
attrs[i] = heads[i]
else:
attrs[i, j] = heads[i]
elif annot is morphs:
for i in range(len(words)):
morph_key = vocab.morphology.add(morphs[i])
if attrs.ndim == 1:
attrs[i] = morph_key
else:
attrs[i, j] = morph_key
else:
for i in range(len(words)):
if attrs.ndim == 1:
attrs[i] = self.vocab.strings[annot[i]]
else:
attrs[i, j] = self.vocab.strings[annot[i]]
j += 1
self.from_array(headings, attrs)
if ents is not None:
self.ents = ents
@property
def _(self):
"""Custom extension attributes registered via `set_extension`."""
@@ -1344,7 +1414,6 @@ cdef int set_children_from_heads(TokenC* tokens, int start, int end) except -1:
if tokens[i].head == 0:
tokens[tokens[i].l_edge].sent_start = 1
cdef int _set_lr_kids_and_edges(TokenC* tokens, int start, int end, int loop_count) except -1:
# May be called multiple times due to non-projectivity. See issues #3170
# and #4688.

View File

@@ -199,13 +199,17 @@ def doc_from_conllu_sentence(
heads.append(head)
deps.append(dep)
doc = Doc(vocab, words=words, spaces=spaces)
doc = Doc(
vocab,
words=words,
spaces=spaces,
tags=tags,
pos=poses,
deps=deps,
lemmas=lemmas,
heads=heads,
)
for i in range(len(doc)):
doc[i].tag_ = tags[i]
doc[i].pos_ = poses[i]
doc[i].dep_ = deps[i]
doc[i].lemma_ = lemmas[i]
doc[i].head = doc[heads[i]]
doc[i]._.merged_orth = words[i]
doc[i]._.merged_morph = morphs[i]
doc[i]._.merged_lemma = lemmas[i]
@@ -232,14 +236,17 @@ def doc_from_conllu_sentence(
heads.append(t.head.i)
deps.append(t.dep_)
doc_x = Doc(vocab, words=words, spaces=spaces)
for i in range(len(doc)):
doc_x[i].tag_ = tags[i]
doc_x[i].morph_ = morphs[i]
doc_x[i].lemma_ = lemmas[i]
doc_x[i].pos_ = poses[i]
doc_x[i].dep_ = deps[i]
doc_x[i].head = doc_x[heads[i]]
doc_x = Doc(
vocab,
words=words,
spaces=spaces,
tags=tags,
morphs=morphs,
lemmas=lemmas,
pos=poses,
deps=deps,
heads=heads,
)
doc_x.ents = [Span(doc_x, ent.start, ent.end, label=ent.label) for ent in doc.ents]
return doc_x

View File

@@ -30,11 +30,20 @@ Construct a `Doc` object. The most common way to get a `Doc` object is via the
> doc = Doc(nlp.vocab, words=words, spaces=spaces)
> ```
| Name | Description |
| -------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `vocab` | A storage container for lexical types. ~~Vocab~~ |
| `words` | A list of strings to add to the container. ~~Optional[List[str]]~~ |
| `spaces` | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. ~~Optional[List[bool]]~~ |
| Name | Description |
| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `vocab` | A storage container for lexical types. ~~Vocab~~ |
| `words` | A list of strings to add to the container. ~~Optional[List[str]]~~ |
| `spaces` | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. ~~Optional[List[bool]]~~ |
| `user_data` | Optional extra data to attach to the Doc. ~~Dict~~ |
| _keyword-only_ | |
| tags | A list of strings, of the same length as words, to assign as `token.tag` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
| pos | A list of strings, of the same length as words, to assign as `token.pos` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
| morphs | A list of strings, of the same length as words, to assign as `token.morph` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
| lemmas | A list of strings, of the same length as words, to assign as `token.lemma` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
| heads | A list of values, of the same length as words, to assign as the head for each word. Head indices are the absolute position of the head in the doc. Defaults to `None`. ~~Optional[List[int]]~~ |
| deps | A list of strings, of the same length as words, to assign as `token.dep` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
| ents | A list of spans to assign as doc.ents. Defaults to `None`. ~~Optional[List[Span]]~~ |
## Doc.\_\_getitem\_\_ {#getitem tag="method"}