From bc02e864943a790cfc7ec991c67d20cc774417df Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Mon, 21 Sep 2020 13:01:26 +0200
Subject: [PATCH 1/4] Extend Doc.__init__ with additional annotation

Mostly copying from `spacy.tests.util.get_doc`, add additional kwargs to
`Doc.__init__` to initialize the most common doc/token values.
---
 spacy/errors.py                          |  5 +-
 spacy/tests/util.py                      | 60 ++----------------
 spacy/tokens/doc.pyx                     | 77 ++++++++++++++++++++++--
 spacy/training/converters/conllu2docs.py | 35 ++++++-----
 website/docs/api/doc.md                  | 19 ++++--
 5 files changed, 118 insertions(+), 78 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index 81e3616be..f219496a5 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -57,7 +57,10 @@ class Warnings:
             "incorrect. Modify PhraseMatcher._terminal_hash to fix.")
     W024 = ("Entity '{entity}' - Alias '{alias}' combination already exists in "
             "the Knowledge Base.")
-    W026 = ("Unable to set all sentence boundaries from dependency parses.")
+    W026 = ("Unable to set all sentence boundaries from dependency parses. If "
+            "you are constructing a parse tree incrementally by setting "
+            "token.head values, you can probably ignore this warning. Consider "
+            "using Doc(words, ..., heads=heads, deps=deps) instead.")
     W027 = ("Found a large training file of {size} bytes. Note that it may "
             "be more efficient to split your training data into multiple "
             "smaller JSON files instead.")
diff --git a/spacy/tests/util.py b/spacy/tests/util.py
index 741753c89..7bc32bf34 100644
--- a/spacy/tests/util.py
+++ b/spacy/tests/util.py
@@ -30,60 +30,12 @@ def get_doc(
     morphs=None,
 ):
     """Create Doc object from given vocab, words and annotations."""
-    if deps and not heads:
-        heads = [0] * len(deps)
-    headings = []
-    values = []
-    annotations = [pos, heads, deps, lemmas, tags, morphs]
-    possible_headings = [POS, HEAD, DEP, LEMMA, TAG, MORPH]
-    for a, annot in enumerate(annotations):
-        if annot is not None:
-            if len(annot) != len(words):
-                raise ValueError(Errors.E189)
-            headings.append(possible_headings[a])
-            if annot is not heads:
-                values.extend(annot)
-    for value in values:
-        vocab.strings.add(value)
-
-    doc = Doc(vocab, words=words)
-
-    # if there are any other annotations, set them
-    if headings:
-        attrs = doc.to_array(headings)
-
-        j = 0
-        for annot in annotations:
-            if annot:
-                if annot is heads:
-                    for i in range(len(words)):
-                        if attrs.ndim == 1:
-                            attrs[i] = heads[i]
-                        else:
-                            attrs[i, j] = heads[i]
-                elif annot is morphs:
-                    for i in range(len(words)):
-                        morph_key = vocab.morphology.add(morphs[i])
-                        if attrs.ndim == 1:
-                            attrs[i] = morph_key
-                        else:
-                            attrs[i, j] = morph_key
-                else:
-                    for i in range(len(words)):
-                        if attrs.ndim == 1:
-                            attrs[i] = doc.vocab.strings[annot[i]]
-                        else:
-                            attrs[i, j] = doc.vocab.strings[annot[i]]
-                j += 1
-        doc.from_array(headings, attrs)
-
-    # finally, set the entities
-    if ents:
-        doc.ents = [
-            Span(doc, start, end, label=doc.vocab.strings[label])
-            for start, end, label in ents
-        ]
-    return doc
+    if heads is not None:
+        heads = [i + head for i, head in enumerate(heads)]
+    if ents is not None:
+        ents = [(vocab.strings[ent_type], start, end) for start, end, ent_type in ents]
+    return Doc(vocab, words=words, pos=pos, heads=heads, deps=deps, tags=tags,
+               ents=ents, lemmas=lemmas, morphs=morphs)
 
 
 def get_batch(batch_size):
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 2d9de278b..de7e0f862 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -158,17 +158,38 @@ cdef class Doc:
             raise ValueError(Errors.E046.format(name=name))
         return Underscore.doc_extensions.pop(name)
 
-    def __init__(self, Vocab vocab, words=None, spaces=None, user_data=None):
+    def __init__(
+        self,
+        Vocab vocab,
+        words=None,
+        spaces=None,
+        user_data=None,
+        *,
+        tags=None,
+        pos=None,
+        morphs=None,
+        lemmas=None,
+        heads=None,
+        deps=None,
+        ents=None,
+    ):
         """Create a Doc object.
 
         vocab (Vocab): A vocabulary object, which must match any models you
             want to use (e.g. tokenizer, parser, entity recognizer).
-        words (list or None): A list of unicode strings to add to the document
+        words (Optional[List[str]]): A list of unicode strings to add to the document
             as words. If `None`, defaults to empty list.
-        spaces (list or None): A list of boolean values, of the same length as
+        spaces (Optional[List[bool]]): A list of boolean values, of the same length as
             words. True means that the word is followed by a space, False means
             it is not. If `None`, defaults to `[True]*len(words)`
         user_data (dict or None): Optional extra data to attach to the Doc.
+        tags (Optional[List[str]]): A list of unicode strings, of the same length as words, to assign as token.tag. Defaults to None.
+        pos (Optional[List[str]]): A list of unicode strings, of the same length as words, to assign as token.pos. Defaults to None.
+        morphs (Optional[List[str]]): A list of unicode strings, of the same length as words, to assign as token.morph. Defaults to None.
+        lemmas (Optional[List[str]]): A list of unicode strings, of the same length as words, to assign as token.lemma. Defaults to None.
+        heads (Optional[List[int]]): A list of values, of the same length as words, to assign as heads. Head indices are the position of the head in the doc. Defaults to None.
+        deps (Optional[List[str]]): A list of unicode strings, of the same length as words, to assign as token.dep. Defaults to None.
+        ents (Optional[List[Span]]): A list of spans to assign as doc.ents. Defaults to None.
 
         DOCS: https://nightly.spacy.io/api/doc#init
         """
@@ -217,6 +238,55 @@ cdef class Doc:
                 lexeme = self.vocab.get_by_orth(self.mem, word)
             self.push_back(lexeme, has_space)
 
+        if heads is not None:
+            heads = [head - i for i, head in enumerate(heads)]
+        if deps and not heads:
+            heads = [0] * len(deps)
+        headings = []
+        values = []
+        annotations = [pos, heads, deps, lemmas, tags, morphs]
+        possible_headings = [POS, HEAD, DEP, LEMMA, TAG, MORPH]
+        for a, annot in enumerate(annotations):
+            if annot is not None:
+                if len(annot) != len(words):
+                    raise ValueError(Errors.E189)
+                headings.append(possible_headings[a])
+                if annot is not heads:
+                    values.extend(annot)
+        for value in values:
+            self.vocab.strings.add(value)
+
+        # if there are any other annotations, set them
+        if headings:
+            attrs = self.to_array(headings)
+
+            j = 0
+            for annot in annotations:
+                if annot:
+                    if annot is heads:
+                        for i in range(len(words)):
+                            if attrs.ndim == 1:
+                                attrs[i] = heads[i]
+                            else:
+                                attrs[i, j] = heads[i]
+                    elif annot is morphs:
+                        for i in range(len(words)):
+                            morph_key = vocab.morphology.add(morphs[i])
+                            if attrs.ndim == 1:
+                                attrs[i] = morph_key
+                            else:
+                                attrs[i, j] = morph_key
+                    else:
+                        for i in range(len(words)):
+                            if attrs.ndim == 1:
+                                attrs[i] = self.vocab.strings[annot[i]]
+                            else:
+                                attrs[i, j] = self.vocab.strings[annot[i]]
+                    j += 1
+            self.from_array(headings, attrs)
+        if ents is not None:
+            self.ents = ents
+
     @property
     def _(self):
         """Custom extension attributes registered via `set_extension`."""
@@ -1344,7 +1414,6 @@ cdef int set_children_from_heads(TokenC* tokens, int start, int end) except -1:
         if tokens[i].head == 0:
             tokens[tokens[i].l_edge].sent_start = 1
 
-
 cdef int _set_lr_kids_and_edges(TokenC* tokens, int start, int end, int loop_count) except -1:
     # May be called multiple times due to non-projectivity. See issues #3170
     # and #4688.
diff --git a/spacy/training/converters/conllu2docs.py b/spacy/training/converters/conllu2docs.py
index ebd123375..b4d8b3ac4 100644
--- a/spacy/training/converters/conllu2docs.py
+++ b/spacy/training/converters/conllu2docs.py
@@ -199,13 +199,17 @@ def doc_from_conllu_sentence(
         heads.append(head)
         deps.append(dep)
 
-    doc = Doc(vocab, words=words, spaces=spaces)
+    doc = Doc(
+        vocab,
+        words=words,
+        spaces=spaces,
+        tags=tags,
+        pos=poses,
+        deps=deps,
+        lemmas=lemmas,
+        heads=heads,
+    )
     for i in range(len(doc)):
-        doc[i].tag_ = tags[i]
-        doc[i].pos_ = poses[i]
-        doc[i].dep_ = deps[i]
-        doc[i].lemma_ = lemmas[i]
-        doc[i].head = doc[heads[i]]
         doc[i]._.merged_orth = words[i]
         doc[i]._.merged_morph = morphs[i]
         doc[i]._.merged_lemma = lemmas[i]
@@ -232,14 +236,17 @@ def doc_from_conllu_sentence(
         heads.append(t.head.i)
         deps.append(t.dep_)
 
-    doc_x = Doc(vocab, words=words, spaces=spaces)
-    for i in range(len(doc)):
-        doc_x[i].tag_ = tags[i]
-        doc_x[i].morph_ = morphs[i]
-        doc_x[i].lemma_ = lemmas[i]
-        doc_x[i].pos_ = poses[i]
-        doc_x[i].dep_ = deps[i]
-        doc_x[i].head = doc_x[heads[i]]
+    doc_x = Doc(
+        vocab,
+        words=words,
+        spaces=spaces,
+        tags=tags,
+        morphs=morphs,
+        lemmas=lemmas,
+        pos=poses,
+        deps=deps,
+        heads=heads,
+    )
     doc_x.ents = [Span(doc_x, ent.start, ent.end, label=ent.label) for ent in doc.ents]
 
     return doc_x
diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md
index 380f6a172..680523c60 100644
--- a/website/docs/api/doc.md
+++ b/website/docs/api/doc.md
@@ -30,11 +30,20 @@ Construct a `Doc` object. The most common way to get a `Doc` object is via the
 > doc = Doc(nlp.vocab, words=words, spaces=spaces)
 > ```
 
-| Name | Description |
-| -------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `vocab` | A storage container for lexical types. ~~Vocab~~ |
-| `words` | A list of strings to add to the container. ~~Optional[List[str]]~~ |
-| `spaces` | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. ~~Optional[List[bool]]~~ |
+| Name | Description |
+| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `vocab` | A storage container for lexical types. ~~Vocab~~ |
+| `words` | A list of strings to add to the container. ~~Optional[List[str]]~~ |
+| `spaces` | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. ~~Optional[List[bool]]~~ |
+| `user_data` | Optional extra data to attach to the Doc. ~~Dict~~ |
+| _keyword-only_ | |
+| tags | A list of strings, of the same length as words, to assign as `token.tag` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
+| pos | A list of strings, of the same length as words, to assign as `token.pos` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
+| morphs | A list of strings, of the same length as words, to assign as `token.morph` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
+| lemmas | A list of strings, of the same length as words, to assign as `token.lemma` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
+| heads | A list of values, of the same length as words, to assign as the head for each word. Head indices are the absolute position of the head in the doc. Defaults to `None`. ~~Optional[List[int]]~~ |
+| deps | A list of strings, of the same length as words, to assign as `token.dep` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
+| ents | A list of spans to assign as doc.ents. Defaults to `None`. ~~Optional[List[Span]]~~ |
 
 ## Doc.\_\_getitem\_\_ {#getitem tag="method"}
 

From ce455f30ca847fc8038d034f39977cb6f3ed53c3 Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Mon, 21 Sep 2020 13:52:46 +0200
Subject: [PATCH 2/4] Fix formatting

---
 spacy/tests/util.py  | 13 +++++++++++--
 spacy/tokens/doc.pyx |  1 +
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/spacy/tests/util.py b/spacy/tests/util.py
index 7bc32bf34..6c67d2ee1 100644
--- a/spacy/tests/util.py
+++ b/spacy/tests/util.py
@@ -34,8 +34,17 @@ def get_doc(
         heads = [i + head for i, head in enumerate(heads)]
     if ents is not None:
         ents = [(vocab.strings[ent_type], start, end) for start, end, ent_type in ents]
-    return Doc(vocab, words=words, pos=pos, heads=heads, deps=deps, tags=tags,
-               ents=ents, lemmas=lemmas, morphs=morphs)
+    return Doc(
+        vocab,
+        words=words,
+        pos=pos,
+        heads=heads,
+        deps=deps,
+        tags=tags,
+        ents=ents,
+        lemmas=lemmas,
+        morphs=morphs,
+    )
 
 
 def get_batch(batch_size):
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index de7e0f862..13167c2d4 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -1414,6 +1414,7 @@ cdef int set_children_from_heads(TokenC* tokens, int start, int end) except -1:
         if tokens[i].head == 0:
             tokens[tokens[i].l_edge].sent_start = 1
 
+
 cdef int _set_lr_kids_and_edges(TokenC* tokens, int start, int end, int loop_count) except -1:
     # May be called multiple times due to non-projectivity. See issues #3170
     # and #4688.

From 6aa91c7ca02acd0df8d5dfba236faf09c3a5a477 Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Mon, 21 Sep 2020 16:00:06 +0200
Subject: [PATCH 3/4] Make user_data keyword-only

---
 spacy/tokens/doc.pyx    | 2 +-
 website/docs/api/doc.md | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 13167c2d4..27efa6cef 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -163,8 +163,8 @@ cdef class Doc:
         Vocab vocab,
         words=None,
         spaces=None,
-        user_data=None,
         *,
+        user_data=None,
         tags=None,
         pos=None,
         morphs=None,
diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md
index 680523c60..baf264b80 100644
--- a/website/docs/api/doc.md
+++ b/website/docs/api/doc.md
@@ -35,8 +35,8 @@ Construct a `Doc` object. The most common way to get a `Doc` object is via the
 | `vocab` | A storage container for lexical types. ~~Vocab~~ |
 | `words` | A list of strings to add to the container. ~~Optional[List[str]]~~ |
 | `spaces` | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. ~~Optional[List[bool]]~~ |
-| `user_data` | Optional extra data to attach to the Doc. ~~Dict~~ |
 | _keyword-only_ | |
+| `user_data` | Optional extra data to attach to the Doc. ~~Dict~~ |
 | tags | A list of strings, of the same length as words, to assign as `token.tag` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
 | pos | A list of strings, of the same length as words, to assign as `token.pos` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
 | morphs | A list of strings, of the same length as words, to assign as `token.morph` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
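With patches 1-3 applied, the most common annotations can be passed straight to the constructor instead of being set token by token. A minimal sketch of the new call (an editorial example, not part of the patch series; the sentence, tags, and parse values are invented, and a bare `Vocab` stands in for a real pipeline's vocab):

```python
from spacy.vocab import Vocab
from spacy.tokens import Doc

vocab = Vocab()
words = ["Apple", "is", "looking", "at", "buying", "a", "startup"]
doc = Doc(
    vocab,
    words=words,
    tags=["NNP", "VBZ", "VBG", "IN", "VBG", "DT", "NN"],
    pos=["PROPN", "AUX", "VERB", "ADP", "VERB", "DET", "NOUN"],
    # heads are absolute token indices here; __init__ converts them to the
    # relative offsets stored on each token
    heads=[2, 2, 2, 2, 3, 6, 4],
    deps=["nsubj", "aux", "ROOT", "prep", "pcomp", "det", "dobj"],
)
assert doc[2].dep_ == "ROOT"
assert doc[0].head.i == 2
```

After patch 3, only `vocab`, `words`, and `spaces` remain positional; `user_data` and all of the annotation kwargs must be passed by keyword.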
From f212303729cb0775bb00eebb6eef0a6c646f92da Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Mon, 21 Sep 2020 17:59:09 +0200
Subject: [PATCH 4/4] Add sent_starts to Doc.__init__

Add sent_starts to `Doc.__init__`. Officially the argument takes
`is_sent_start` values (`True`/`False`/`None`), but the internal
`sent_start` values (`1`/`-1`/`0`) are also accepted and converted.
---
 spacy/tests/doc/test_doc_api.py | 20 ++++++++++++++
 spacy/tokens/doc.pyx            | 46 +++++++++++++++++++++++----------
 website/docs/api/doc.md         |  1 +
 3 files changed, 54 insertions(+), 13 deletions(-)

diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py
index c979931b1..0579642c4 100644
--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@@ -9,6 +9,26 @@ from spacy.attrs import ENT_TYPE, ENT_IOB, SENT_START, HEAD, DEP, MORPH
 from ..util import get_doc
 
 
+def test_doc_api_init(en_vocab):
+    # set sent_start by sent_starts
+    doc = Doc(
+        en_vocab, words=["a", "b", "c", "d"], sent_starts=[True, False, True, False]
+    )
+    assert [t.is_sent_start for t in doc] == [True, False, True, False]
+
+    # set sent_start by heads
+    doc = Doc(
+        en_vocab, words=["a", "b", "c", "d"], heads=[0, 0, 2, 2], deps=["dep"] * 4
+    )
+    assert [t.is_sent_start for t in doc] == [True, False, True, False]
+
+    # heads override sent_starts
+    doc = Doc(
+        en_vocab, words=["a", "b", "c", "d"], sent_starts=[True] * 4, heads=[0, 0, 2, 2], deps=["dep"] * 4
+    )
+    assert [t.is_sent_start for t in doc] == [True, False, True, False]
+
+
 @pytest.mark.parametrize("text", [["one", "two", "three"]])
 def test_doc_api_compare_by_string_position(en_vocab, text):
     doc = Doc(en_vocab, words=text)
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 27efa6cef..c5f1f6801 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -171,6 +171,7 @@ cdef class Doc:
         lemmas=None,
         heads=None,
         deps=None,
+        sent_starts=None,
         ents=None,
     ):
         """Create a Doc object.
@@ -183,13 +184,24 @@ cdef class Doc:
             words. True means that the word is followed by a space, False means
             it is not. If `None`, defaults to `[True]*len(words)`
         user_data (dict or None): Optional extra data to attach to the Doc.
-        tags (Optional[List[str]]): A list of unicode strings, of the same length as words, to assign as token.tag. Defaults to None.
-        pos (Optional[List[str]]): A list of unicode strings, of the same length as words, to assign as token.pos. Defaults to None.
-        morphs (Optional[List[str]]): A list of unicode strings, of the same length as words, to assign as token.morph. Defaults to None.
-        lemmas (Optional[List[str]]): A list of unicode strings, of the same length as words, to assign as token.lemma. Defaults to None.
-        heads (Optional[List[int]]): A list of values, of the same length as words, to assign as heads. Head indices are the position of the head in the doc. Defaults to None.
-        deps (Optional[List[str]]): A list of unicode strings, of the same length as words, to assign as token.dep. Defaults to None.
-        ents (Optional[List[Span]]): A list of spans to assign as doc.ents. Defaults to None.
+        tags (Optional[List[str]]): A list of unicode strings, of the same
+            length as words, to assign as token.tag. Defaults to None.
+        pos (Optional[List[str]]): A list of unicode strings, of the same
+            length as words, to assign as token.pos. Defaults to None.
+        morphs (Optional[List[str]]): A list of unicode strings, of the same
+            length as words, to assign as token.morph. Defaults to None.
+        lemmas (Optional[List[str]]): A list of unicode strings, of the same
+            length as words, to assign as token.lemma. Defaults to None.
+        heads (Optional[List[int]]): A list of values, of the same length as
+            words, to assign as heads. Head indices are the position of the
+            head in the doc. Defaults to None.
+        deps (Optional[List[str]]): A list of unicode strings, of the same
+            length as words, to assign as token.dep. Defaults to None.
+        sent_starts (Optional[List[Union[bool, None]]]): A list of values, of
+            the same length as words, to assign as token.is_sent_start. Will be
+            overridden by heads if heads is provided. Defaults to None.
+        ents (Optional[List[Span]]): A list of spans to assign as doc.ents.
+            Defaults to None.
 
         DOCS: https://nightly.spacy.io/api/doc#init
         """
@@ -242,16 +254,24 @@ cdef class Doc:
             heads = [head - i for i, head in enumerate(heads)]
         if deps and not heads:
             heads = [0] * len(deps)
+        if sent_starts is not None:
+            for i in range(len(sent_starts)):
+                if sent_starts[i] is True:
+                    sent_starts[i] = 1
+                elif sent_starts[i] is False:
+                    sent_starts[i] = -1
+                elif sent_starts[i] is None or sent_starts[i] not in [-1, 0, 1]:
+                    sent_starts[i] = 0
         headings = []
         values = []
-        annotations = [pos, heads, deps, lemmas, tags, morphs]
-        possible_headings = [POS, HEAD, DEP, LEMMA, TAG, MORPH]
+        annotations = [pos, heads, deps, lemmas, tags, morphs, sent_starts]
+        possible_headings = [POS, HEAD, DEP, LEMMA, TAG, MORPH, SENT_START]
         for a, annot in enumerate(annotations):
             if annot is not None:
                 if len(annot) != len(words):
                     raise ValueError(Errors.E189)
                 headings.append(possible_headings[a])
-                if annot is not heads:
+                if annot is not heads and annot is not sent_starts:
                     values.extend(annot)
         for value in values:
             self.vocab.strings.add(value)
@@ -263,12 +283,12 @@ cdef class Doc:
         j = 0
         for annot in annotations:
             if annot:
-                if annot is heads:
+                if annot is heads or annot is sent_starts:
                     for i in range(len(words)):
                         if attrs.ndim == 1:
-                            attrs[i] = heads[i]
+                            attrs[i] = annot[i]
                         else:
-                            attrs[i, j] = heads[i]
+                            attrs[i, j] = annot[i]
                 elif annot is morphs:
                     for i in range(len(words)):
                         morph_key = vocab.morphology.add(morphs[i])
diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md
index baf264b80..52f94a83d 100644
--- a/website/docs/api/doc.md
+++ b/website/docs/api/doc.md
@@ -43,6 +43,7 @@ Construct a `Doc` object. The most common way to get a `Doc` object is via the
 | lemmas | A list of strings, of the same length as words, to assign as `token.lemma` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
 | heads | A list of values, of the same length as words, to assign as the head for each word. Head indices are the absolute position of the head in the doc. Defaults to `None`. ~~Optional[List[int]]~~ |
 | deps | A list of strings, of the same length as words, to assign as `token.dep` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
+| sent_starts | A list of values, of the same length as words, to assign as `token.is_sent_start`. Will be overridden by heads if heads is provided. Defaults to `None`. ~~Optional[List[Union[bool, None]]]~~ |
 | ents | A list of spans to assign as doc.ents. Defaults to `None`. ~~Optional[List[Span]]~~ |
 
 ## Doc.\_\_getitem\_\_ {#getitem tag="method"}
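With the full series applied, sentence boundaries can be set either explicitly through `sent_starts` or implicitly through the parse. A short sketch of the final behavior, mirroring the cases pinned down in `test_doc_api_init` (again an editorial example with invented words and values, not part of the patches):

```python
from spacy.vocab import Vocab
from spacy.tokens import Doc

vocab = Vocab()
words = ["Hello", "world", "This", "is", "spaCy"]

# sent_starts takes is_sent_start-style values (True/False/None) and
# converts them to the internal sent_start values (1/-1/0)
doc = Doc(vocab, words=words, sent_starts=[True, False, True, False, False])
assert [t.is_sent_start for t in doc] == [True, False, True, False, False]

# when heads and deps are provided, sentence starts are derived from the
# parse instead, overriding any explicit sent_starts values
doc = Doc(
    vocab,
    words=words,
    sent_starts=[True] * 5,
    heads=[1, 1, 4, 4, 4],  # tokens 1 and 4 head themselves, so each roots a sentence
    deps=["intj", "ROOT", "nsubj", "aux", "ROOT"],
)
assert [t.is_sent_start for t in doc] == [True, False, True, False, False]
```

Passing the internal `sent_start` values (`1`, `-1`, `0`) instead of `True`/`False`/`None` yields the same result, since the constructor normalizes both forms.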