From 1abeca90a62f164a2492b92fe7e15c9da51b7a88 Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Thu, 7 Jan 2021 18:58:13 +0100
Subject: [PATCH 1/7] refer to _parser_internals.nonproj.DELIMITER

---
 spacy/cli/debug_data.py       | 3 ++-
 spacy/pipeline/dep_parser.pyx | 5 +++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py
index d23cd3717..8eabf1f8f 100644
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@@ -12,6 +12,7 @@ from ..training import Example
 from ..training.initialize import get_sourced_components
 from ..schemas import ConfigSchemaTraining
 from ..pipeline._parser_internals import nonproj
+from ..pipeline._parser_internals.nonproj import DELIMITER
 from ..language import Language
 from ..util import registry, resolve_dot_names
 from .. import util
@@ -383,7 +384,7 @@ def debug_data(
     # rare labels in projectivized train
     rare_projectivized_labels = []
     for label in gold_train_data["deps"]:
-        if gold_train_data["deps"][label] <= DEP_LABEL_THRESHOLD and "||" in label:
+        if gold_train_data["deps"][label] <= DEP_LABEL_THRESHOLD and DELIMITER in label:
             rare_projectivized_labels.append(
                 f"{label}: {gold_train_data['deps'][label]}"
             )
diff --git a/spacy/pipeline/dep_parser.pyx b/spacy/pipeline/dep_parser.pyx
index 1fe29eb9b..18c9fd25a 100644
--- a/spacy/pipeline/dep_parser.pyx
+++ b/spacy/pipeline/dep_parser.pyx
@@ -9,6 +9,7 @@ from ._parser_internals.arc_eager cimport ArcEager
 from .functions import merge_subtokens
 from ..language import Language
 from ._parser_internals import nonproj
+from ._parser_internals.nonproj import DELIMITER
 from ..scorer import Scorer
 from ..training import validate_examples

@@ -230,8 +231,8 @@ cdef class DependencyParser(Parser):
         for move in self.move_names:
             if "-" in move:
                 label = move.split("-")[1]
-                if "||" in label:
-                    label = label.split("||")[1]
+                if DELIMITER in label:
+                    label = label.split(DELIMITER)[1]
                 labels.add(label)
         return tuple(sorted(labels))

From dd12c6c8fda95c5346f8e8c74c30664bc37af6f0 Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Thu, 7 Jan 2021 19:10:32 +0100
Subject: [PATCH 2/7] allow missing information in deps and heads annotations

---
 .../pipeline/_parser_internals/arc_eager.pyx |  5 +--
 spacy/symbols.pxd                            |  1 +
 spacy/symbols.pyx                            |  1 +
 spacy/tests/doc/test_array.py                |  8 +++-
 spacy/tests/parser/test_parse.py             | 38 ++++++++++++++++++-
 spacy/tokens/doc.pyx                         |  8 ++--
 spacy/tokens/token.pyx                       |  9 +++--
 spacy/training/example.pyx                   | 16 ++++++--
 8 files changed, 71 insertions(+), 15 deletions(-)

diff --git a/spacy/pipeline/_parser_internals/arc_eager.pyx b/spacy/pipeline/_parser_internals/arc_eager.pyx
index 90a70b17b..463980051 100644
--- a/spacy/pipeline/_parser_internals/arc_eager.pyx
+++ b/spacy/pipeline/_parser_internals/arc_eager.pyx
@@ -195,8 +195,7 @@ cdef class ArcEagerGold:
     def __init__(self, ArcEager moves, StateClass stcls, Example example):
         self.mem = Pool()
         heads, labels = example.get_aligned_parse(projectivize=True)
-        labels = [label if label is not None else "" for label in labels]
-        labels = [example.x.vocab.strings.add(label) for label in labels]
+        labels = [example.x.vocab.strings.add(label) if label is not None else 0 for label in labels]
         sent_starts = example.get_aligned_sent_starts()
         assert len(heads) == len(labels) == len(sent_starts), (len(heads), len(labels), len(sent_starts))
         self.c = create_gold_state(self.mem, stcls.c, heads, labels, sent_starts)
@@ -783,7 +782,7 @@ cdef class ArcEager(TransitionSystem):
             for i in range(self.n_moves):
                 print(self.get_class_name(i), is_valid[i], costs[i])
             print("Gold sent starts?", is_sent_start(&gold_state, state.B(0)), is_sent_start(&gold_state, state.B(1)))
-            raise ValueError
+            raise ValueError("Could not find gold transition - see logs above.")

     def get_oracle_sequence_from_state(self, StateClass state, ArcEagerGold gold, _debug=None):
         cdef int i
diff --git a/spacy/symbols.pxd b/spacy/symbols.pxd
index bc15d9b80..e28322cb5 100644
--- a/spacy/symbols.pxd
+++ b/spacy/symbols.pxd
@@ -467,3 +467,4 @@ cdef enum symbol_t:
     IDX
     _
+    MISSING_LABEL
diff --git a/spacy/symbols.pyx b/spacy/symbols.pyx
index b0345c710..83c693c98 100644
--- a/spacy/symbols.pyx
+++ b/spacy/symbols.pyx
@@ -466,6 +466,7 @@ IDS = {
     "LAW": LAW,
     "MORPH": MORPH,
     "_": _,
+    "MISSING_LABEL": MISSING_LABEL,
 }
diff --git a/spacy/tests/doc/test_array.py b/spacy/tests/doc/test_array.py
index ef54c581c..92b9620ff 100644
--- a/spacy/tests/doc/test_array.py
+++ b/spacy/tests/doc/test_array.py
@@ -98,10 +98,16 @@ def test_doc_from_array_heads_in_bounds(en_vocab):
     doc_from_array = Doc(en_vocab, words=words)
     doc_from_array.from_array(["HEAD"], arr)

-    # head before start
+    # head before start is used to denote a missing value
     arr = doc.to_array(["HEAD"])
     arr[0] = -1
     doc_from_array = Doc(en_vocab, words=words)
+    doc_from_array.from_array(["HEAD"], arr)
+
+    # other negative values are invalid
+    arr = doc.to_array(["HEAD"])
+    arr[0] = -2
+    doc_from_array = Doc(en_vocab, words=words)
     with pytest.raises(ValueError):
         doc_from_array.from_array(["HEAD"], arr)
diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py
index e7728baaf..437cc760c 100644
--- a/spacy/tests/parser/test_parse.py
+++ b/spacy/tests/parser/test_parse.py
@@ -45,7 +45,17 @@ CONFLICTING_DATA = [
     ),
 ]

-eps = 0.01
+PARTIAL_DATA = [
+    (
+        "I like London.",
+        {
+            "heads": [1, 1, 1, None],
+            "deps": ["nsubj", "ROOT", "dobj", None],
+        },
+    ),
+]
+
+eps = 0.1


 def test_parser_root(en_vocab):
@@ -205,6 +215,32 @@ def test_parser_set_sent_starts(en_vocab):
             assert token.head in sent


+@pytest.mark.parametrize("pipe_name", ["parser", "beam_parser"])
+def test_incomplete_data(pipe_name):
+    # Test that the parser works with incomplete information
+    nlp = English()
+    parser = nlp.add_pipe(pipe_name)
+    train_examples = []
+    for text, annotations in PARTIAL_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
+        for dep in annotations.get("deps", []):
+            if dep is not None:
+                parser.add_label(dep)
+    optimizer = nlp.initialize(get_examples=lambda: train_examples)
+    for i in range(150):
+        losses = {}
+        nlp.update(train_examples, sgd=optimizer, losses=losses)
+    assert losses[pipe_name] < 0.0001
+
+    # test the trained model
+    test_text = "I like securities."
+    doc = nlp(test_text)
+    assert doc[0].dep_ == "nsubj"
+    assert doc[2].dep_ == "dobj"
+    assert doc[0].head.i == 1
+    assert doc[2].head.i == 1
+
+
 @pytest.mark.parametrize("pipe_name", ["parser", "beam_parser"])
 def test_overfitting_IO(pipe_name):
     # Simple test to try and quickly overfit the dependency parser (normal or beam)
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 9eedf214b..92344b6c8 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -266,7 +266,7 @@ cdef class Doc:
                 self.push_back(lexeme, has_space)

         if heads is not None:
-            heads = [head - i for i, head in enumerate(heads)]
+            heads = [head - i if head is not None else None for i, head in enumerate(heads)]
         if deps and not heads:
             heads = [0] * len(deps)
         if sent_starts is not None:
@@ -328,7 +328,8 @@ cdef class Doc:
             if annot is not heads and annot is not sent_starts and annot is not ent_iobs:
                 values.extend(annot)
         for value in values:
-            self.vocab.strings.add(value)
+            if value is not None:
+                self.vocab.strings.add(value)

         # if there are any other annotations, set them
         if headings:
@@ -1039,7 +1040,8 @@ cdef class Doc:
                     # cast index to signed int
                     abs_head_index = values[col * stride + i]
                     abs_head_index += i
-                    if abs_head_index < 0 or abs_head_index >= length:
+                    # abs_head_index -1 refers to missing value
+                    if abs_head_index < -1 or abs_head_index >= length:
                         raise ValueError(
                             Errors.E190.format(
                                 index=i,
diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index 2075c3cc8..c52f7da1b 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -639,13 +639,16 @@ cdef class Token:
         return any(ancestor.i == self.i for ancestor in descendant.ancestors)

     property head:
         """The syntactic parent, or "governor", of this token.

         RETURNS (Token): The token predicted by the parser to be the head of
-            the current token.
+            the current token. Returns None if unknown.
""" def __get__(self): - return self.doc[self.i + self.c.head] + head_i = self.i + self.c.head + if head_i == -1: + return None + return self.doc[head_i] def __set__(self, Token new_head): # This function sets the head of self to new_head and updates the diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx index 21907e7dd..fc5cd8e26 100644 --- a/spacy/training/example.pyx +++ b/spacy/training/example.pyx @@ -11,6 +11,7 @@ from .alignment import Alignment from .iob_utils import biluo_to_iob, offsets_to_biluo_tags, doc_to_biluo_tags from .iob_utils import biluo_tags_to_spans from ..errors import Errors, Warnings +from ..symbols import MISSING_LABEL from ..pipeline._parser_internals import nonproj from ..util import logger @@ -179,14 +180,18 @@ cdef class Example: gold_to_cand = self.alignment.y2x aligned_heads = [None] * self.x.length aligned_deps = [None] * self.x.length - heads = [token.head.i for token in self.y] + heads = [token.head.i if token.head is not None else -1 for token in self.y] deps = [token.dep_ for token in self.y] if projectivize: - heads, deps = nonproj.projectivize(heads, deps) + proj_heads, proj_deps = nonproj.projectivize(heads, deps) + # don't touch the missing data + heads = [h if heads[i] != -1 else -1 for i, h in enumerate(proj_heads)] + MISSING = self.x.vocab.strings[MISSING_LABEL] + deps = [d if deps[i] != MISSING else MISSING for i, d in enumerate(proj_deps)] for cand_i in range(self.x.length): if cand_to_gold.lengths[cand_i] == 1: gold_i = cand_to_gold[cand_i].dataXd[0, 0] - if gold_to_cand.lengths[heads[gold_i]] == 1: + if heads[gold_i] != -1 and gold_to_cand.lengths[heads[gold_i]] == 1: aligned_heads[cand_i] = int(gold_to_cand[heads[gold_i]].dataXd[0, 0]) aligned_deps[cand_i] = deps[gold_i] return aligned_heads, aligned_deps @@ -329,7 +334,10 @@ def _annot2array(vocab, tok_annot, doc_annot): pass elif key == "HEAD": attrs.append(key) - values.append([h-i for i, h in enumerate(value)]) + values.append([h-i if h is not None else -(i+1) for i, h in enumerate(value)]) + elif key == "DEP": + attrs.append(key) + values.append([vocab.strings.add(h) if h is not None else MISSING_LABEL for h in value]) elif key == "SENT_START": attrs.append(key) values.append(value) From a581d82f33742fbff264d1ded3b55c7e65d00abb Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 12 Jan 2021 17:17:06 +0100 Subject: [PATCH 3/7] introduce token.has_head and refer to MISSING_DEP_ (WIP) --- spacy/symbols.pxd | 1 - spacy/symbols.pyx | 1 - spacy/tests/doc/test_array.py | 8 +------- spacy/tests/doc/test_retokenize_merge.py | 9 ++++++--- spacy/tests/doc/test_retokenize_split.py | 3 ++- spacy/tests/doc/test_token_api.py | 22 ++++++++++++++++++++++ spacy/tests/parser/test_nonproj.py | 2 +- spacy/tests/training/test_new_example.py | 19 +++++++++++++++++++ spacy/tests/training/test_training.py | 3 ++- spacy/tokens/doc.pyx | 8 +++++--- spacy/tokens/token.pyx | 22 +++++++++++++++++----- spacy/training/example.pyx | 18 ++++++++---------- 12 files changed, 83 insertions(+), 33 deletions(-) diff --git a/spacy/symbols.pxd b/spacy/symbols.pxd index e28322cb5..bc15d9b80 100644 --- a/spacy/symbols.pxd +++ b/spacy/symbols.pxd @@ -467,4 +467,3 @@ cdef enum symbol_t: IDX _ - MISSING_LABEL diff --git a/spacy/symbols.pyx b/spacy/symbols.pyx index 83c693c98..b0345c710 100644 --- a/spacy/symbols.pyx +++ b/spacy/symbols.pyx @@ -466,7 +466,6 @@ IDS = { "LAW": LAW, "MORPH": MORPH, "_": _, - "MISSING_LABEL": MISSING_LABEL, } diff --git a/spacy/tests/doc/test_array.py b/spacy/tests/doc/test_array.py 
index 92b9620ff..ef54c581c 100644
--- a/spacy/tests/doc/test_array.py
+++ b/spacy/tests/doc/test_array.py
@@ -98,16 +98,10 @@ def test_doc_from_array_heads_in_bounds(en_vocab):
     doc_from_array = Doc(en_vocab, words=words)
     doc_from_array.from_array(["HEAD"], arr)

-    # head before start is used to denote a missing value
+    # head before start
     arr = doc.to_array(["HEAD"])
     arr[0] = -1
     doc_from_array = Doc(en_vocab, words=words)
-    doc_from_array.from_array(["HEAD"], arr)
-
-    # other negative values are invalid
-    arr = doc.to_array(["HEAD"])
-    arr[0] = -2
-    doc_from_array = Doc(en_vocab, words=words)
     with pytest.raises(ValueError):
         doc_from_array.from_array(["HEAD"], arr)
diff --git a/spacy/tests/doc/test_retokenize_merge.py b/spacy/tests/doc/test_retokenize_merge.py
index 60cc66d66..48cd33890 100644
--- a/spacy/tests/doc/test_retokenize_merge.py
+++ b/spacy/tests/doc/test_retokenize_merge.py
@@ -89,8 +89,9 @@ def test_doc_retokenize_lex_attrs(en_tokenizer):
 def test_doc_retokenize_spans_merge_tokens(en_tokenizer):
     text = "Los Angeles start."
     heads = [1, 2, 2, 2]
+    deps = ["dep"] * len(heads)
     tokens = en_tokenizer(text)
-    doc = Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
+    doc = Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
     assert len(doc) == 4
     assert doc[0].head.text == "Angeles"
     assert doc[1].head.text == "start"
@@ -145,7 +146,8 @@ def test_doc_retokenize_spans_merge_tokens_default_attrs(en_vocab):
 def test_doc_retokenize_spans_merge_heads(en_vocab):
     words = ["I", "found", "a", "pilates", "class", "near", "work", "."]
     heads = [1, 1, 4, 6, 1, 4, 5, 1]
-    doc = Doc(en_vocab, words=words, heads=heads)
+    deps = ["dep"] * len(heads)
+    doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
     assert len(doc) == 8
     with doc.retokenize() as retokenizer:
         attrs = {"tag": doc[4].tag_, "lemma": "pilates class", "ent_type": "O"}
@@ -177,8 +179,9 @@ def test_doc_retokenize_spans_merge_non_disjoint(en_tokenizer):
 def test_doc_retokenize_span_np_merges(en_tokenizer):
     text = "displaCy is a parse tool built with Javascript"
     heads = [1, 1, 4, 4, 1, 4, 5, 6]
+    deps = ["dep"] * len(heads)
     tokens = en_tokenizer(text)
-    doc = Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
+    doc = Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
     assert doc[4].head.i == 1
     with doc.retokenize() as retokenizer:
         attrs = {"tag": "NP", "lemma": "tool", "ent_type": "O"}
diff --git a/spacy/tests/doc/test_retokenize_split.py b/spacy/tests/doc/test_retokenize_split.py
index 21c3ffd4b..6bfd508bc 100644
--- a/spacy/tests/doc/test_retokenize_split.py
+++ b/spacy/tests/doc/test_retokenize_split.py
@@ -6,7 +6,8 @@ from spacy.tokens import Doc, Token
 def test_doc_retokenize_split(en_vocab):
     words = ["LosAngeles", "start", "."]
     heads = [1, 2, 2]
-    doc = Doc(en_vocab, words=words, heads=heads)
+    deps = ["dep"] * len(heads)
+    doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
     assert len(doc) == 3
     assert len(str(doc)) == 19
     assert doc[0].head.text == "start"
diff --git a/spacy/tests/doc/test_token_api.py b/spacy/tests/doc/test_token_api.py
index 3c5c063bd..4587f5601 100644
--- a/spacy/tests/doc/test_token_api.py
+++ b/spacy/tests/doc/test_token_api.py
@@ -4,6 +4,8 @@ from spacy.attrs import IS_ALPHA, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_TITLE, IS_STO
 from spacy.symbols import VERB
 from spacy.vocab import Vocab
 from spacy.tokens import Doc
+from spacy.tokens.token import MISSING_DEP_
+from spacy.training import Example


 @pytest.fixture
@@ -250,3 +252,23 @@ def test_token_api_non_conjuncts(en_vocab):
     doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
     assert [w.text for w in doc[0].conjuncts] == []
     assert [w.text for w in doc[1].conjuncts] == []
+
+
+def test_missing_head_dep(en_vocab):
+    heads = [1, 1, 1, 1, 2, None]
+    deps = ["nsubj", "ROOT", "dobj", "cc", "conj", None]
+    words = ["I", "like", "London", "and", "Berlin", "."]
+    doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
+    pred_has_heads = [t.has_head() for t in doc]
+    pred_deps = [t.dep_ for t in doc]
+    assert pred_has_heads == [True, True, True, True, True, False]
+    assert pred_deps == ["nsubj", "ROOT", "dobj", "cc", "conj", MISSING_DEP_]
+    example = Example.from_dict(doc, {"heads": heads, "deps": deps})
+    ref_heads = [t.head.i for t in example.reference]
+    ref_deps = [t.dep_ for t in example.reference]
+    ref_has_heads = [t.has_head() for t in example.reference]
+    assert ref_deps == ["nsubj", "ROOT", "dobj", "cc", "conj", MISSING_DEP_]
+    assert ref_has_heads == [True, True, True, True, True, False]
+    aligned_heads, aligned_deps = example.get_aligned_parse(projectivize=True)
+    assert aligned_heads[5] == ref_heads[5]
+    assert aligned_deps[5] == MISSING_DEP_
\ No newline at end of file
diff --git a/spacy/tests/parser/test_nonproj.py b/spacy/tests/parser/test_nonproj.py
index 544701a4c..3957e4d77 100644
--- a/spacy/tests/parser/test_nonproj.py
+++ b/spacy/tests/parser/test_nonproj.py
@@ -121,7 +121,7 @@ def test_parser_pseudoprojectivity(en_vocab):
     assert undeco_labels == ["det", "nsubj", "root", "det", "dobj", "aux",
                              "nsubj", "acl", "punct"]
     # if there are two potential new heads, the first one is chosen even if
-    # it"s wrong
+    # it's wrong
     proj_heads = [1, 1, 3, 1, 5, 6, 9, 8, 6, 1, 9, 12, 13, 10, 1]
     deco_labels = ["advmod||aux", "root", "det", "aux", "advmod", "det",
                    "dobj", "det", "nmod", "aux", "nmod||dobj", "advmod",
diff --git a/spacy/tests/training/test_new_example.py b/spacy/tests/training/test_new_example.py
index 06db86a12..01ed5b5b6 100644
--- a/spacy/tests/training/test_new_example.py
+++ b/spacy/tests/training/test_new_example.py
@@ -263,3 +263,22 @@ def test_Example_from_dict_sentences():
     annots = {"sent_starts": [1, -1, 0, 0, 0]}
     ex = Example.from_dict(predicted, annots)
     assert len(list(ex.reference.sents)) == 1
+
+
+def test_Example_from_dict_with_parse():
+    vocab = Vocab()
+    words = ["I", "like", "London", "and", "Berlin", "."]
+    deps = ["nsubj", "ROOT", "dobj", "cc", "conj", "punct"]
+    heads = [1, 1, 1, 2, 2, 1]
+    annots_head_only = {"words": words, "heads": heads}
+    annots_head_dep = {"words": words, "heads": heads, "deps": deps}
+    predicted = Doc(vocab, words=words)
+
+    # when not providing deps, the head information is considered to be missing
+    # in this case, the token's heads refer to themselves
+    example_1 = Example.from_dict(predicted, annots_head_only)
+    assert [t.head.i for t in example_1.reference] == [0, 1, 2, 3, 4, 5]
+
+    # when providing deps, the head information is actually used
+    example_2 = Example.from_dict(predicted, annots_head_dep)
+    assert [t.head.i for t in example_2.reference] == heads
diff --git a/spacy/tests/training/test_training.py b/spacy/tests/training/test_training.py
index 2e83580b5..c7a85bf87 100644
--- a/spacy/tests/training/test_training.py
+++ b/spacy/tests/training/test_training.py
@@ -436,7 +436,8 @@ def test_gold_ner_missing_tags(en_tokenizer):
 def test_projectivize(en_tokenizer):
     doc = en_tokenizer("He pretty quickly walks away")
     heads = [3, 2, 3, 0, 2]
-    example = Example.from_dict(doc, {"heads": heads})
+    deps = ["dep"] * len(heads)
+    example = Example.from_dict(doc, {"heads": heads, "deps": deps})
     proj_heads, proj_labels = example.get_aligned_parse(projectivize=True)
     nonproj_heads, nonproj_labels = example.get_aligned_parse(projectivize=False)
     assert proj_heads == [3, 2, 3, 0, 3]
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 92344b6c8..fc14fb506 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -16,6 +16,7 @@ from thinc.util import copy_array
 import warnings

 from .span cimport Span
+from .token import MISSING_DEP_
 from .token cimport Token
 from ..lexeme cimport Lexeme, EMPTY_LEXEME
 from ..typedefs cimport attr_t, flags_t
@@ -266,7 +267,9 @@ cdef class Doc:
                 self.push_back(lexeme, has_space)

         if heads is not None:
-            heads = [head - i if head is not None else None for i, head in enumerate(heads)]
+            heads = [head - i if head is not None else 0 for i, head in enumerate(heads)]
+        if deps is not None:
+            deps = [dep if dep is not None else MISSING_DEP_ for dep in deps]
         if deps and not heads:
             heads = [0] * len(deps)
         if sent_starts is not None:
@@ -1040,8 +1043,7 @@ cdef class Doc:
                     # cast index to signed int
                     abs_head_index = values[col * stride + i]
                     abs_head_index += i
-                    # abs_head_index -1 refers to missing value
-                    if abs_head_index < -1 or abs_head_index >= length:
+                    if abs_head_index < 0 or abs_head_index >= length:
                         raise ValueError(
                             Errors.E190.format(
                                 index=i,
diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index c52f7da1b..a6f9a2a0c 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -22,6 +22,8 @@ from .. import parts_of_speech
 from ..errors import Errors, Warnings
 from .underscore import Underscore, get_ext_args

+MISSING_DEP_ = ""
+

 cdef class Token:
     """An individual token – i.e. a word, punctuation symbol, whitespace,
@@ -638,17 +640,27 @@ cdef class Token:
             return False
         return any(ancestor.i == self.i for ancestor in descendant.ancestors)

+
+    def has_head(self):
+        """Check whether the token has annotated head information.
+
+        RETURNS (bool): Whether the head annotation is valid or not.
+        """
+        return self.dep_ != MISSING_DEP_
+
+
     property head:
         """The syntactic parent, or "governor", of this token.
+        If token.has_head() is `False`, this method will return itself.

         RETURNS (Token): The token predicted by the parser to be the head of
-            the current token. Returns None if unknown.
+            the current token.
""" def __get__(self): - head_i = self.i + self.c.head - if head_i == -1: - return None - return self.doc[head_i] + if not self.has_head(): + return self + else: + return self.doc[self.i + self.c.head] def __set__(self, Token new_head): # This function sets the head of self to new_head and updates the diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx index fc5cd8e26..856719893 100644 --- a/spacy/training/example.pyx +++ b/spacy/training/example.pyx @@ -11,8 +11,8 @@ from .alignment import Alignment from .iob_utils import biluo_to_iob, offsets_to_biluo_tags, doc_to_biluo_tags from .iob_utils import biluo_tags_to_spans from ..errors import Errors, Warnings -from ..symbols import MISSING_LABEL from ..pipeline._parser_internals import nonproj +from ..tokens.token import MISSING_DEP_ from ..util import logger @@ -180,18 +180,15 @@ cdef class Example: gold_to_cand = self.alignment.y2x aligned_heads = [None] * self.x.length aligned_deps = [None] * self.x.length - heads = [token.head.i if token.head is not None else -1 for token in self.y] + has_heads = [token.has_head() for token in self.y] + heads = [token.head.i for token in self.y] deps = [token.dep_ for token in self.y] if projectivize: - proj_heads, proj_deps = nonproj.projectivize(heads, deps) - # don't touch the missing data - heads = [h if heads[i] != -1 else -1 for i, h in enumerate(proj_heads)] - MISSING = self.x.vocab.strings[MISSING_LABEL] - deps = [d if deps[i] != MISSING else MISSING for i, d in enumerate(proj_deps)] + heads, deps = nonproj.projectivize(heads, deps) for cand_i in range(self.x.length): if cand_to_gold.lengths[cand_i] == 1: gold_i = cand_to_gold[cand_i].dataXd[0, 0] - if heads[gold_i] != -1 and gold_to_cand.lengths[heads[gold_i]] == 1: + if gold_to_cand.lengths[heads[gold_i]] == 1: aligned_heads[cand_i] = int(gold_to_cand[heads[gold_i]].dataXd[0, 0]) aligned_deps[cand_i] = deps[gold_i] return aligned_heads, aligned_deps @@ -334,10 +331,11 @@ def _annot2array(vocab, tok_annot, doc_annot): pass elif key == "HEAD": attrs.append(key) - values.append([h-i if h is not None else -(i+1) for i, h in enumerate(value)]) + values.append([h-i if h is not None else 0 for i, h in enumerate(value)]) elif key == "DEP": attrs.append(key) - values.append([vocab.strings.add(h) if h is not None else MISSING_LABEL for h in value]) + value = [v if v is not None else MISSING_DEP_ for v in value] + values.append([vocab.strings.add(h) for h in value]) elif key == "SENT_START": attrs.append(key) values.append(value) From 5b598bd1d56c9a40748754174fbc63f66559ca2f Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 12 Jan 2021 17:28:41 +0100 Subject: [PATCH 4/7] formatting --- spacy/pipeline/_parser_internals/arc_eager.pyx | 4 +++- spacy/tests/doc/test_token_api.py | 6 +++--- spacy/tests/training/test_new_example.py | 2 +- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/spacy/pipeline/_parser_internals/arc_eager.pyx b/spacy/pipeline/_parser_internals/arc_eager.pyx index 463980051..50b620b7a 100644 --- a/spacy/pipeline/_parser_internals/arc_eager.pyx +++ b/spacy/pipeline/_parser_internals/arc_eager.pyx @@ -9,6 +9,7 @@ from ...typedefs cimport hash_t, attr_t from ...strings cimport hash_string from ...structs cimport TokenC from ...tokens.doc cimport Doc, set_children_from_heads +from ...tokens.token import MISSING_DEP_ from ...training.example cimport Example from .stateclass cimport StateClass from ._state cimport StateC, ArcC @@ -195,7 +196,8 @@ cdef class ArcEagerGold: def __init__(self, ArcEager moves, StateClass 
         self.mem = Pool()
         heads, labels = example.get_aligned_parse(projectivize=True)
-        labels = [example.x.vocab.strings.add(label) if label is not None else 0 for label in labels]
+        labels = [label if label is not None else MISSING_DEP_ for label in labels]
+        labels = [example.x.vocab.strings.add(label) for label in labels]
         sent_starts = example.get_aligned_sent_starts()
         assert len(heads) == len(labels) == len(sent_starts), (len(heads), len(labels), len(sent_starts))
         self.c = create_gold_state(self.mem, stcls.c, heads, labels, sent_starts)
diff --git a/spacy/tests/doc/test_token_api.py b/spacy/tests/doc/test_token_api.py
index 4587f5601..d3fb044ee 100644
--- a/spacy/tests/doc/test_token_api.py
+++ b/spacy/tests/doc/test_token_api.py
@@ -259,8 +259,8 @@ def test_missing_head_dep(en_vocab):
     deps = ["nsubj", "ROOT", "dobj", "cc", "conj", None]
     words = ["I", "like", "London", "and", "Berlin", "."]
     doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
-    pred_has_heads = [t.has_head() for t in doc] 
-    pred_deps = [t.dep_ for t in doc] 
+    pred_has_heads = [t.has_head() for t in doc]
+    pred_deps = [t.dep_ for t in doc]
     assert pred_has_heads == [True, True, True, True, True, False]
     assert pred_deps == ["nsubj", "ROOT", "dobj", "cc", "conj", MISSING_DEP_]
     example = Example.from_dict(doc, {"heads": heads, "deps": deps})
@@ -271,4 +271,4 @@ def test_missing_head_dep(en_vocab):
     assert ref_has_heads == [True, True, True, True, True, False]
     aligned_heads, aligned_deps = example.get_aligned_parse(projectivize=True)
     assert aligned_heads[5] == ref_heads[5]
-    assert aligned_deps[5] == MISSING_DEP_
\ No newline at end of file
+    assert aligned_deps[5] == MISSING_DEP_
diff --git a/spacy/tests/training/test_new_example.py b/spacy/tests/training/test_new_example.py
index 01ed5b5b6..6b6486b2b 100644
--- a/spacy/tests/training/test_new_example.py
+++ b/spacy/tests/training/test_new_example.py
@@ -265,7 +265,7 @@ def test_Example_from_dict_sentences():
     assert len(list(ex.reference.sents)) == 1


-def test_Example_from_dict_with_parse():
+def test_Example_missing_deps():
     vocab = Vocab()
     words = ["I", "like", "London", "and", "Berlin", "."]
     deps = ["nsubj", "ROOT", "dobj", "cc", "conj", "punct"]

From 232e953b148e1dd9259a34bcbc64aec1d7786e08 Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Tue, 12 Jan 2021 20:32:57 +0100
Subject: [PATCH 5/7] pytest.approx with absolute eps

---
 spacy/tests/parser/test_parse.py | 72 ++++++++++++++++----------------
 1 file changed, 36 insertions(+), 36 deletions(-)

diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py
index 437cc760c..5b68bbc37 100644
--- a/spacy/tests/parser/test_parse.py
+++ b/spacy/tests/parser/test_parse.py
@@ -360,25 +360,25 @@ def test_beam_overfitting_IO():
     head_scores = head_scores[0]
     label_scores = label_scores[0]
     # test label annotations: 0=nsubj, 2=dobj, 3=punct
-    assert label_scores[(0, "nsubj")] == pytest.approx(1.0, eps)
-    assert label_scores[(0, "dobj")] == pytest.approx(0.0, eps)
-    assert label_scores[(0, "punct")] == pytest.approx(0.0, eps)
-    assert label_scores[(2, "nsubj")] == pytest.approx(0.0, eps)
-    assert label_scores[(2, "dobj")] == pytest.approx(1.0, eps)
-    assert label_scores[(2, "punct")] == pytest.approx(0.0, eps)
-    assert label_scores[(3, "nsubj")] == pytest.approx(0.0, eps)
-    assert label_scores[(3, "dobj")] == pytest.approx(0.0, eps)
-    assert label_scores[(3, "punct")] == pytest.approx(1.0, eps)
+    assert label_scores[(0, "nsubj")] == pytest.approx(1.0, abs=eps)
+    assert label_scores[(0, "dobj")] == pytest.approx(0.0, abs=eps)
+    assert label_scores[(0, "punct")] == pytest.approx(0.0, abs=eps)
+    assert label_scores[(2, "nsubj")] == pytest.approx(0.0, abs=eps)
+    assert label_scores[(2, "dobj")] == pytest.approx(1.0, abs=eps)
+    assert label_scores[(2, "punct")] == pytest.approx(0.0, abs=eps)
+    assert label_scores[(3, "nsubj")] == pytest.approx(0.0, abs=eps)
+    assert label_scores[(3, "dobj")] == pytest.approx(0.0, abs=eps)
+    assert label_scores[(3, "punct")] == pytest.approx(1.0, abs=eps)
     # test head annotations: the root is token at index 1
-    assert head_scores[(0, 0)] == pytest.approx(0.0, eps)
-    assert head_scores[(0, 1)] == pytest.approx(1.0, eps)
-    assert head_scores[(0, 2)] == pytest.approx(0.0, eps)
-    assert head_scores[(2, 0)] == pytest.approx(0.0, eps)
-    assert head_scores[(2, 1)] == pytest.approx(1.0, eps)
-    assert head_scores[(2, 2)] == pytest.approx(0.0, eps)
-    assert head_scores[(3, 0)] == pytest.approx(0.0, eps)
-    assert head_scores[(3, 1)] == pytest.approx(1.0, eps)
-    assert head_scores[(3, 2)] == pytest.approx(0.0, eps)
+    assert head_scores[(0, 0)] == pytest.approx(0.0, abs=eps)
+    assert head_scores[(0, 1)] == pytest.approx(1.0, abs=eps)
+    assert head_scores[(0, 2)] == pytest.approx(0.0, abs=eps)
+    assert head_scores[(2, 0)] == pytest.approx(0.0, abs=eps)
+    assert head_scores[(2, 1)] == pytest.approx(1.0, abs=eps)
+    assert head_scores[(2, 2)] == pytest.approx(0.0, abs=eps)
+    assert head_scores[(3, 0)] == pytest.approx(0.0, abs=eps)
+    assert head_scores[(3, 1)] == pytest.approx(1.0, abs=eps)
+    assert head_scores[(3, 2)] == pytest.approx(0.0, abs=eps)

     # Also test the results are still the same after IO
     with make_tempdir() as tmp_dir:
@@ -392,21 +392,21 @@ def test_beam_overfitting_IO():
     head_scores2 = head_scores2[0]
     label_scores2 = label_scores2[0]
     # check the results again
-    assert label_scores2[(0, "nsubj")] == pytest.approx(1.0, eps)
-    assert label_scores2[(0, "dobj")] == pytest.approx(0.0, eps)
-    assert label_scores2[(0, "punct")] == pytest.approx(0.0, eps)
-    assert label_scores2[(2, "nsubj")] == pytest.approx(0.0, eps)
-    assert label_scores2[(2, "dobj")] == pytest.approx(1.0, eps)
-    assert label_scores2[(2, "punct")] == pytest.approx(0.0, eps)
-    assert label_scores2[(3, "nsubj")] == pytest.approx(0.0, eps)
-    assert label_scores2[(3, "dobj")] == pytest.approx(0.0, eps)
-    assert label_scores2[(3, "punct")] == pytest.approx(1.0, eps)
-    assert head_scores2[(0, 0)] == pytest.approx(0.0, eps)
-    assert head_scores2[(0, 1)] == pytest.approx(1.0, eps)
-    assert head_scores2[(0, 2)] == pytest.approx(0.0, eps)
-    assert head_scores2[(2, 0)] == pytest.approx(0.0, eps)
-    assert head_scores2[(2, 1)] == pytest.approx(1.0, eps)
-    assert head_scores2[(2, 2)] == pytest.approx(0.0, eps)
-    assert head_scores2[(3, 0)] == pytest.approx(0.0, eps)
-    assert head_scores2[(3, 1)] == pytest.approx(1.0, eps)
-    assert head_scores2[(3, 2)] == pytest.approx(0.0, eps)
+    assert label_scores2[(0, "nsubj")] == pytest.approx(1.0, abs=eps)
+    assert label_scores2[(0, "dobj")] == pytest.approx(0.0, abs=eps)
+    assert label_scores2[(0, "punct")] == pytest.approx(0.0, abs=eps)
+    assert label_scores2[(2, "nsubj")] == pytest.approx(0.0, abs=eps)
+    assert label_scores2[(2, "dobj")] == pytest.approx(1.0, abs=eps)
+    assert label_scores2[(2, "punct")] == pytest.approx(0.0, abs=eps)
+    assert label_scores2[(3, "nsubj")] == pytest.approx(0.0, abs=eps)
+    assert label_scores2[(3, "dobj")] == pytest.approx(0.0, abs=eps)
+    assert label_scores2[(3, "punct")] == pytest.approx(1.0, abs=eps)
+    assert head_scores2[(0, 0)] == pytest.approx(0.0, abs=eps)
+    assert head_scores2[(0, 1)] == pytest.approx(1.0, abs=eps)
+    assert head_scores2[(0, 2)] == pytest.approx(0.0, abs=eps)
+    assert head_scores2[(2, 0)] == pytest.approx(0.0, abs=eps)
+    assert head_scores2[(2, 1)] == pytest.approx(1.0, abs=eps)
+    assert head_scores2[(2, 2)] == pytest.approx(0.0, abs=eps)
+    assert head_scores2[(3, 0)] == pytest.approx(0.0, abs=eps)
+    assert head_scores2[(3, 1)] == pytest.approx(1.0, abs=eps)
+    assert head_scores2[(3, 2)] == pytest.approx(0.0, abs=eps)

From 86a4e316b873a2752d06f3bc222a97074f69e716 Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Wed, 13 Jan 2021 13:47:25 +0100
Subject: [PATCH 6/7] fix sent_starts

---
 spacy/tests/doc/test_token_api.py        |  4 ++++
 spacy/tests/training/test_new_example.py | 21 +++++++++++++++++++++
 spacy/tokens/doc.pyx                     |  2 +-
 spacy/tokens/token.pxd                   |  7 +++++++
 spacy/training/example.pyx               |  5 ++++-
 5 files changed, 37 insertions(+), 2 deletions(-)

diff --git a/spacy/tests/doc/test_token_api.py b/spacy/tests/doc/test_token_api.py
index d3fb044ee..0795080a5 100644
--- a/spacy/tests/doc/test_token_api.py
+++ b/spacy/tests/doc/test_token_api.py
@@ -261,14 +261,18 @@ def test_missing_head_dep(en_vocab):
     doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
     pred_has_heads = [t.has_head() for t in doc]
     pred_deps = [t.dep_ for t in doc]
+    pred_sent_starts = [t.is_sent_start for t in doc]
     assert pred_has_heads == [True, True, True, True, True, False]
     assert pred_deps == ["nsubj", "ROOT", "dobj", "cc", "conj", MISSING_DEP_]
+    assert pred_sent_starts == [True, False, False, False, False, False]
     example = Example.from_dict(doc, {"heads": heads, "deps": deps})
     ref_heads = [t.head.i for t in example.reference]
     ref_deps = [t.dep_ for t in example.reference]
     ref_has_heads = [t.has_head() for t in example.reference]
+    ref_sent_starts = [t.is_sent_start for t in example.reference]
     assert ref_deps == ["nsubj", "ROOT", "dobj", "cc", "conj", MISSING_DEP_]
     assert ref_has_heads == [True, True, True, True, True, False]
+    assert ref_sent_starts == [True, False, False, False, False, False]
     aligned_heads, aligned_deps = example.get_aligned_parse(projectivize=True)
     assert aligned_heads[5] == ref_heads[5]
     assert aligned_deps[5] == MISSING_DEP_
diff --git a/spacy/tests/training/test_new_example.py b/spacy/tests/training/test_new_example.py
index 6b6486b2b..0a3184071 100644
--- a/spacy/tests/training/test_new_example.py
+++ b/spacy/tests/training/test_new_example.py
@@ -282,3 +282,24 @@ def test_Example_missing_deps():
     # when providing deps, the head information is actually used
     example_2 = Example.from_dict(predicted, annots_head_dep)
     assert [t.head.i for t in example_2.reference] == heads
+
+
+def test_Example_missing_heads():
+    vocab = Vocab()
+    words = ["I", "like", "London", "and", "Berlin", "."]
+    deps = ["nsubj", "ROOT", "dobj", None, "conj", "punct"]
+    heads = [1, 1, 1, None, 2, 1]
+    annots = {"words": words, "heads": heads, "deps": deps}
+    predicted = Doc(vocab, words=words)
+
+    example = Example.from_dict(predicted, annots)
+    parsed_heads = [t.head.i for t in example.reference]
+    assert parsed_heads[0] == heads[0]
+    assert parsed_heads[1] == heads[1]
+    assert parsed_heads[2] == heads[2]
+    assert parsed_heads[4] == heads[4]
+    assert parsed_heads[5] == heads[5]
+    assert [t.has_head() for t in example.reference] == [True, True, True, False, True, True]
+
+    # Ensure that the missing head doesn't create an artificial new sentence start
+    assert example.get_aligned_sent_starts() == [True, False, False, False, False, False]
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index fc14fb506..221e78b2e 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -1540,7 +1540,7 @@ cdef int set_children_from_heads(TokenC* tokens, int start, int end) except -1:
     for i in range(start, end):
         tokens[i].sent_start = -1
     for i in range(start, end):
-        if tokens[i].head == 0:
+        if tokens[i].head == 0 and not Token.missing_head(&tokens[i]):
             tokens[tokens[i].l_edge].sent_start = 1

diff --git a/spacy/tokens/token.pxd b/spacy/tokens/token.pxd
index 45c906a82..9006c874c 100644
--- a/spacy/tokens/token.pxd
+++ b/spacy/tokens/token.pxd
@@ -94,3 +94,10 @@ cdef class Token:
                 token.ent_kb_id = value
             elif feat_name == SENT_START:
                 token.sent_start = value
+
+    @staticmethod
+    cdef inline int missing_head(const TokenC* token) nogil:
+        if token.dep == 0:
+            return 1
+        else:
+            return 0
diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx
index 856719893..3303a8456 100644
--- a/spacy/training/example.pyx
+++ b/spacy/training/example.pyx
@@ -184,7 +184,10 @@ cdef class Example:
         heads = [token.head.i for token in self.y]
         deps = [token.dep_ for token in self.y]
         if projectivize:
-            heads, deps = nonproj.projectivize(heads, deps)
+            proj_heads, proj_deps = nonproj.projectivize(heads, deps)
+            # ensure that data that was previously missing, remains missing
+            heads = [h if has_heads[i] else heads[i] for i, h in enumerate(proj_heads)]
+            deps = [d if deps[i] != MISSING_DEP_ else MISSING_DEP_ for i, d in enumerate(proj_deps)]
         for cand_i in range(self.x.length):
             if cand_to_gold.lengths[cand_i] == 1:
                 gold_i = cand_to_gold[cand_i].dataXd[0, 0]

From ed53bb979d2b8e62577ff3d45140ca6a9a6bb1c5 Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Wed, 13 Jan 2021 14:20:05 +0100
Subject: [PATCH 7/7] cleanup

---
 .../pipeline/_parser_internals/arc_eager.pyx |  5 ++--
 spacy/tests/doc/test_token_api.py            | 30 ++++++++++++-------
 spacy/tests/parser/test_parse.py             |  2 +-
 spacy/tokens/doc.pyx                         |  3 +-
 spacy/tokens/token.pxd                       | 12 +++++---
 spacy/tokens/token.pyx                       | 15 ++++++----
 spacy/training/example.pyx                   | 10 +++----
 7 files changed, 48 insertions(+), 29 deletions(-)

diff --git a/spacy/pipeline/_parser_internals/arc_eager.pyx b/spacy/pipeline/_parser_internals/arc_eager.pyx
index 50b620b7a..f50f91f21 100644
--- a/spacy/pipeline/_parser_internals/arc_eager.pyx
+++ b/spacy/pipeline/_parser_internals/arc_eager.pyx
@@ -9,7 +9,7 @@ from ...typedefs cimport hash_t, attr_t
 from ...strings cimport hash_string
 from ...structs cimport TokenC
 from ...tokens.doc cimport Doc, set_children_from_heads
-from ...tokens.token import MISSING_DEP_
+from ...tokens.token cimport MISSING_DEP
 from ...training.example cimport Example
 from .stateclass cimport StateClass
 from ._state cimport StateC, ArcC
@@ -196,8 +196,7 @@ cdef class ArcEagerGold:
     def __init__(self, ArcEager moves, StateClass stcls, Example example):
         self.mem = Pool()
         heads, labels = example.get_aligned_parse(projectivize=True)
-        labels = [label if label is not None else MISSING_DEP_ for label in labels]
-        labels = [example.x.vocab.strings.add(label) for label in labels]
+        labels = [example.x.vocab.strings.add(label) if label is not None else MISSING_DEP for label in labels]
         sent_starts = example.get_aligned_sent_starts()
         assert len(heads) == len(labels) == len(sent_starts), (len(heads), len(labels), len(sent_starts))
         self.c = create_gold_state(self.mem, stcls.c, heads, labels, sent_starts)
diff --git a/spacy/tests/doc/test_token_api.py b/spacy/tests/doc/test_token_api.py
index 0795080a5..dda28809d 100644
--- a/spacy/tests/doc/test_token_api.py
+++ b/spacy/tests/doc/test_token_api.py
@@ -4,7 +4,6 @@ from spacy.attrs import IS_ALPHA, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_TITLE, IS_STO
 from spacy.symbols import VERB
 from spacy.vocab import Vocab
 from spacy.tokens import Doc
-from spacy.tokens.token import MISSING_DEP_
 from spacy.training import Example


@@ -255,24 +254,35 @@ def test_token_api_non_conjuncts(en_vocab):


 def test_missing_head_dep(en_vocab):
-    heads = [1, 1, 1, 1, 2, None]
-    deps = ["nsubj", "ROOT", "dobj", "cc", "conj", None]
+    """ Check that the Doc constructor and Example.from_dict parse missing information the same"""
+    heads = [1, 1, 1, 1, 2, None]  # element 5 is missing
+    deps = ["", "ROOT", "dobj", "cc", "conj", None]  # element 0 and 5 are missing
     words = ["I", "like", "London", "and", "Berlin", "."]
     doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
     pred_has_heads = [t.has_head() for t in doc]
+    pred_has_deps = [t.has_dep() for t in doc]
+    pred_heads = [t.head.i for t in doc]
     pred_deps = [t.dep_ for t in doc]
     pred_sent_starts = [t.is_sent_start for t in doc]
-    assert pred_has_heads == [True, True, True, True, True, False]
-    assert pred_deps == ["nsubj", "ROOT", "dobj", "cc", "conj", MISSING_DEP_]
+    assert pred_has_heads == [False, True, True, True, True, False]
+    assert pred_has_deps == [False, True, True, True, True, False]
+    assert pred_heads[1:5] == [1, 1, 1, 2]
+    assert pred_deps[1:5] == ["ROOT", "dobj", "cc", "conj"]
     assert pred_sent_starts == [True, False, False, False, False, False]
     example = Example.from_dict(doc, {"heads": heads, "deps": deps})
+    ref_has_heads = [t.has_head() for t in example.reference]
+    ref_has_deps = [t.has_dep() for t in example.reference]
     ref_heads = [t.head.i for t in example.reference]
     ref_deps = [t.dep_ for t in example.reference]
-    ref_has_heads = [t.has_head() for t in example.reference]
     ref_sent_starts = [t.is_sent_start for t in example.reference]
-    assert ref_deps == ["nsubj", "ROOT", "dobj", "cc", "conj", MISSING_DEP_]
-    assert ref_has_heads == [True, True, True, True, True, False]
-    assert ref_sent_starts == [True, False, False, False, False, False]
+    assert ref_has_heads == pred_has_heads
+    assert ref_has_deps == pred_has_deps
+    assert ref_heads == pred_heads
+    assert ref_deps == pred_deps
+    assert ref_sent_starts == pred_sent_starts
+    # check that the aligned parse preserves the missing information
     aligned_heads, aligned_deps = example.get_aligned_parse(projectivize=True)
+    assert aligned_deps[0] == ref_deps[0]
+    assert aligned_heads[0] == ref_heads[0]
+    assert aligned_deps[5] == ref_deps[5]
     assert aligned_heads[5] == ref_heads[5]
-    assert aligned_deps[5] == MISSING_DEP_
diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py
index 5b68bbc37..dc878dd7a 100644
--- a/spacy/tests/parser/test_parse.py
+++ b/spacy/tests/parser/test_parse.py
@@ -253,7 +253,7 @@ def test_overfitting_IO(pipe_name):
         parser.add_label(dep)
     optimizer = nlp.initialize()
     # run overfitting
-    for i in range(150):
+    for i in range(200):
         losses = {}
         nlp.update(train_examples, sgd=optimizer, losses=losses)
     assert losses[pipe_name] < 0.0001
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 221e78b2e..456d0a732 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -16,7 +16,7 @@ from thinc.util import copy_array
 import warnings

 from .span cimport Span
-from .token import MISSING_DEP_
+from .token cimport MISSING_DEP
 from .token cimport Token
 from ..lexeme cimport Lexeme, EMPTY_LEXEME
 from ..typedefs cimport attr_t, flags_t
@@ -269,6 +269,7 @@ cdef class Doc:
         if heads is not None:
             heads = [head - i if head is not None else 0 for i, head in enumerate(heads)]
         if deps is not None:
+            MISSING_DEP_ = self.vocab.strings[MISSING_DEP]
             deps = [dep if dep is not None else MISSING_DEP_ for dep in deps]
         if deps and not heads:
             heads = [0] * len(deps)
diff --git a/spacy/tokens/token.pxd b/spacy/tokens/token.pxd
index 9006c874c..58b727764 100644
--- a/spacy/tokens/token.pxd
+++ b/spacy/tokens/token.pxd
@@ -9,6 +9,7 @@ from ..lexeme cimport Lexeme

 from ..errors import Errors

+cdef int MISSING_DEP = 0

 cdef class Token:
     cdef readonly Vocab vocab
@@ -95,9 +96,12 @@ cdef class Token:
             elif feat_name == SENT_START:
                 token.sent_start = value

+
+    @staticmethod
+    cdef inline int missing_dep(const TokenC* token) nogil:
+        return token.dep == MISSING_DEP
+
+
     @staticmethod
     cdef inline int missing_head(const TokenC* token) nogil:
-        if token.dep == 0:
-            return 1
-        else:
-            return 0
+        return Token.missing_dep(token)
diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index a6f9a2a0c..27aa30199 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -22,8 +22,6 @@ from .. import parts_of_speech
 from ..errors import Errors, Warnings
 from .underscore import Underscore, get_ext_args

-MISSING_DEP_ = ""
-

 cdef class Token:
     """An individual token – i.e. a word, punctuation symbol, whitespace,
@@ -640,14 +638,13 @@ cdef class Token:
             return False
         return any(ancestor.i == self.i for ancestor in descendant.ancestors)

-
     def has_head(self):
         """Check whether the token has annotated head information.
+        Return False when the head annotation is unset/missing.

         RETURNS (bool): Whether the head annotation is valid or not.
         """
-        return self.dep_ != MISSING_DEP_
-
+        return not Token.missing_head(self.c)

     property head:
         """The syntactic parent, or "governor", of this token.
@@ -873,6 +870,14 @@ cdef class Token:
         def __set__(self, tag):
             self.tag = self.vocab.strings.add(tag)

+    def has_dep(self):
+        """Check whether the token has annotated dep information.
+        Returns False when the dep label is unset/missing.
+
+        RETURNS (bool): Whether the dep label is valid or not.
+        """
+        return not Token.missing_dep(self.c)
+
     property dep_:
         """RETURNS (str): The syntactic dependency label."""
         def __get__(self):
diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx
index 3303a8456..fe4ee6ff4 100644
--- a/spacy/training/example.pyx
+++ b/spacy/training/example.pyx
@@ -12,7 +12,7 @@ from .iob_utils import biluo_to_iob, offsets_to_biluo_tags, doc_to_biluo_tags
 from .iob_utils import biluo_tags_to_spans
 from ..errors import Errors, Warnings
 from ..pipeline._parser_internals import nonproj
-from ..tokens.token import MISSING_DEP_
+from ..tokens.token cimport MISSING_DEP
 from ..util import logger

@@ -180,14 +180,15 @@ cdef class Example:
         gold_to_cand = self.alignment.y2x
         aligned_heads = [None] * self.x.length
         aligned_deps = [None] * self.x.length
+        has_deps = [token.has_dep() for token in self.y]
         has_heads = [token.has_head() for token in self.y]
         heads = [token.head.i for token in self.y]
         deps = [token.dep_ for token in self.y]
         if projectivize:
             proj_heads, proj_deps = nonproj.projectivize(heads, deps)
-            # ensure that data that was previously missing, remains missing
+            # ensure that missing data remains missing
             heads = [h if has_heads[i] else heads[i] for i, h in enumerate(proj_heads)]
-            deps = [d if deps[i] != MISSING_DEP_ else MISSING_DEP_ for i, d in enumerate(proj_deps)]
+            deps = [d if has_deps[i] else deps[i] for i, d in enumerate(proj_deps)]
         for cand_i in range(self.x.length):
             if cand_to_gold.lengths[cand_i] == 1:
                 gold_i = cand_to_gold[cand_i].dataXd[0, 0]
@@ -337,8 +338,7 @@ def _annot2array(vocab, tok_annot, doc_annot):
                 values.append([h-i if h is not None else 0 for i, h in enumerate(value)])
             elif key == "DEP":
                 attrs.append(key)
-                value = [v if v is not None else MISSING_DEP_ for v in value]
-                values.append([vocab.strings.add(h) for h in value])
+                values.append([vocab.strings.add(h) if h is not None else MISSING_DEP for h in value])
             elif key == "SENT_START":
                 attrs.append(key)
                 values.append(value)
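
A minimal usage sketch of the partial-annotation behaviour this series introduces, assuming a spaCy v3 build with all seven patches applied. It is an illustration, not part of the patches, and it exercises only calls pinned down by the tests above: the Doc constructor with None entries in heads/deps, Token.has_head()/Token.has_dep(), and Example.get_aligned_parse().

    # Sketch only: mirrors test_missing_head_dep and test_Example_missing_heads.
    from spacy.vocab import Vocab
    from spacy.tokens import Doc
    from spacy.training import Example

    vocab = Vocab()
    words = ["I", "like", "London", "and", "Berlin", "."]
    heads = [1, 1, 1, 1, 2, None]                         # head of "." is unannotated
    deps = ["nsubj", "ROOT", "dobj", "cc", "conj", None]  # dep of "." is unannotated

    # The Doc constructor now accepts None for missing heads/deps instead of raising.
    doc = Doc(vocab, words=words, heads=heads, deps=deps)
    assert not doc[5].has_head()  # the missing head is flagged rather than guessed
    assert doc[5].head.i == 5     # a token with a missing head points to itself
    assert not doc[5].has_dep()   # the dep label is likewise marked as missing

    # Example.from_dict stores the same partial annotations on the reference doc,
    # and projectivization leaves the missing entries untouched.
    example = Example.from_dict(Doc(vocab, words=words), {"heads": heads, "deps": deps})
    aligned_heads, aligned_deps = example.get_aligned_parse(projectivize=True)
    assert [t.has_head() for t in example.reference] == [True] * 5 + [False]
    assert aligned_deps[5] == example.reference[5].dep_  # still the missing label

The design choice behind this: a missing head is encoded as a self-reference whose dep is the MISSING_DEP symbol (0), so has_head() and has_dep() are the reliable way to tell an unannotated token apart from a genuine root, and set_children_from_heads can avoid treating missing heads as new sentence starts.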