Mirror of https://github.com/explosion/spaCy.git (synced 2024-12-26 01:46:28 +03:00)

Merge pull request #6691 from svlandeg/feature/missing-dep

Commit 8ba5d88b4b
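
This merge adds support for partially missing dependency annotation: the Doc constructor and Example.from_dict accept None for individual heads and deps, a MISSING_DEP sentinel marks unset labels internally, Token gains has_head() and has_dep() accessors, alignment and projectivization keep missing values missing, and the parser can be trained on such partial data (see the new PARTIAL_DATA and test_incomplete_data additions below).
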
@@ -12,6 +12,7 @@ from ..training import Example
 from ..training.initialize import get_sourced_components
 from ..schemas import ConfigSchemaTraining
 from ..pipeline._parser_internals import nonproj
+from ..pipeline._parser_internals.nonproj import DELIMITER
 from ..language import Language
 from ..util import registry, resolve_dot_names
 from .. import util

@@ -383,7 +384,7 @@ def debug_data(
         # rare labels in projectivized train
         rare_projectivized_labels = []
         for label in gold_train_data["deps"]:
-            if gold_train_data["deps"][label] <= DEP_LABEL_THRESHOLD and "||" in label:
+            if gold_train_data["deps"][label] <= DEP_LABEL_THRESHOLD and DELIMITER in label:
                 rare_projectivized_labels.append(
                     f"{label}: {gold_train_data['deps'][label]}"
                 )

@@ -9,6 +9,7 @@ from ...typedefs cimport hash_t, attr_t
 from ...strings cimport hash_string
 from ...structs cimport TokenC
 from ...tokens.doc cimport Doc, set_children_from_heads
+from ...tokens.token cimport MISSING_DEP
 from ...training.example cimport Example
 from .stateclass cimport StateClass
 from ._state cimport StateC, ArcC

@@ -195,8 +196,7 @@ cdef class ArcEagerGold:
     def __init__(self, ArcEager moves, StateClass stcls, Example example):
         self.mem = Pool()
         heads, labels = example.get_aligned_parse(projectivize=True)
-        labels = [label if label is not None else "" for label in labels]
-        labels = [example.x.vocab.strings.add(label) for label in labels]
+        labels = [example.x.vocab.strings.add(label) if label is not None else MISSING_DEP for label in labels]
         sent_starts = example.get_aligned_sent_starts()
         assert len(heads) == len(labels) == len(sent_starts), (len(heads), len(labels), len(sent_starts))
         self.c = create_gold_state(self.mem, stcls.c, heads, labels, sent_starts)

@@ -783,7 +783,7 @@ cdef class ArcEager(TransitionSystem):
         for i in range(self.n_moves):
             print(self.get_class_name(i), is_valid[i], costs[i])
         print("Gold sent starts?", is_sent_start(&gold_state, state.B(0)), is_sent_start(&gold_state, state.B(1)))
-        raise ValueError
+        raise ValueError("Could not find gold transition - see logs above.")

     def get_oracle_sequence_from_state(self, StateClass state, ArcEagerGold gold, _debug=None):
         cdef int i

@@ -9,6 +9,7 @@ from ._parser_internals.arc_eager cimport ArcEager
 from .functions import merge_subtokens
 from ..language import Language
 from ._parser_internals import nonproj
+from ._parser_internals.nonproj import DELIMITER
 from ..scorer import Scorer
 from ..training import validate_examples


@@ -230,8 +231,8 @@ cdef class DependencyParser(Parser):
         for move in self.move_names:
             if "-" in move:
                 label = move.split("-")[1]
-                if "||" in label:
-                    label = label.split("||")[1]
+                if DELIMITER in label:
+                    label = label.split(DELIMITER)[1]
                 labels.add(label)
         return tuple(sorted(labels))

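DELIMITER is the "||" separator that nonproj uses to decorate labels when projectivizing a non-projective tree (for example "nmod||dobj" in the pseudoprojectivity test further down); these hunks and the debug_data hunk above replace the previously hard-coded "||" strings with that constant.
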
@@ -89,8 +89,9 @@ def test_doc_retokenize_lex_attrs(en_tokenizer):
 def test_doc_retokenize_spans_merge_tokens(en_tokenizer):
     text = "Los Angeles start."
     heads = [1, 2, 2, 2]
+    deps = ["dep"] * len(heads)
     tokens = en_tokenizer(text)
-    doc = Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
+    doc = Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
     assert len(doc) == 4
     assert doc[0].head.text == "Angeles"
     assert doc[1].head.text == "start"

@@ -145,7 +146,8 @@ def test_doc_retokenize_spans_merge_tokens_default_attrs(en_vocab):
 def test_doc_retokenize_spans_merge_heads(en_vocab):
     words = ["I", "found", "a", "pilates", "class", "near", "work", "."]
     heads = [1, 1, 4, 6, 1, 4, 5, 1]
-    doc = Doc(en_vocab, words=words, heads=heads)
+    deps = ["dep"] * len(heads)
+    doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
     assert len(doc) == 8
     with doc.retokenize() as retokenizer:
         attrs = {"tag": doc[4].tag_, "lemma": "pilates class", "ent_type": "O"}

@@ -177,8 +179,9 @@ def test_doc_retokenize_spans_merge_non_disjoint(en_tokenizer):
 def test_doc_retokenize_span_np_merges(en_tokenizer):
     text = "displaCy is a parse tool built with Javascript"
     heads = [1, 1, 4, 4, 1, 4, 5, 6]
+    deps = ["dep"] * len(heads)
     tokens = en_tokenizer(text)
-    doc = Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
+    doc = Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
     assert doc[4].head.i == 1
     with doc.retokenize() as retokenizer:
         attrs = {"tag": "NP", "lemma": "tool", "ent_type": "O"}

@@ -6,7 +6,8 @@ from spacy.tokens import Doc, Token
 def test_doc_retokenize_split(en_vocab):
     words = ["LosAngeles", "start", "."]
     heads = [1, 2, 2]
-    doc = Doc(en_vocab, words=words, heads=heads)
+    deps = ["dep"] * len(heads)
+    doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
     assert len(doc) == 3
     assert len(str(doc)) == 19
     assert doc[0].head.text == "start"

@@ -4,6 +4,7 @@ from spacy.attrs import IS_ALPHA, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_TITLE, IS_STO
 from spacy.symbols import VERB
 from spacy.vocab import Vocab
 from spacy.tokens import Doc
+from spacy.training import Example


 @pytest.fixture

@@ -250,3 +251,38 @@ def test_token_api_non_conjuncts(en_vocab):
     doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
     assert [w.text for w in doc[0].conjuncts] == []
     assert [w.text for w in doc[1].conjuncts] == []
+
+
+def test_missing_head_dep(en_vocab):
+    """ Check that the Doc constructor and Example.from_dict parse missing information the same"""
+    heads = [1, 1, 1, 1, 2, None]  # element 5 is missing
+    deps = ["", "ROOT", "dobj", "cc", "conj", None]  # element 0 and 5 are missing
+    words = ["I", "like", "London", "and", "Berlin", "."]
+    doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
+    pred_has_heads = [t.has_head() for t in doc]
+    pred_has_deps = [t.has_dep() for t in doc]
+    pred_heads = [t.head.i for t in doc]
+    pred_deps = [t.dep_ for t in doc]
+    pred_sent_starts = [t.is_sent_start for t in doc]
+    assert pred_has_heads == [False, True, True, True, True, False]
+    assert pred_has_deps == [False, True, True, True, True, False]
+    assert pred_heads[1:5] == [1, 1, 1, 2]
+    assert pred_deps[1:5] == ["ROOT", "dobj", "cc", "conj"]
+    assert pred_sent_starts == [True, False, False, False, False, False]
+    example = Example.from_dict(doc, {"heads": heads, "deps": deps})
+    ref_has_heads = [t.has_head() for t in example.reference]
+    ref_has_deps = [t.has_dep() for t in example.reference]
+    ref_heads = [t.head.i for t in example.reference]
+    ref_deps = [t.dep_ for t in example.reference]
+    ref_sent_starts = [t.is_sent_start for t in example.reference]
+    assert ref_has_heads == pred_has_heads
+    assert ref_has_deps == pred_has_heads
+    assert ref_heads == pred_heads
+    assert ref_deps == pred_deps
+    assert ref_sent_starts == pred_sent_starts
+    # check that the aligned parse preserves the missing information
+    aligned_heads, aligned_deps = example.get_aligned_parse(projectivize=True)
+    assert aligned_deps[0] == ref_deps[0]
+    assert aligned_heads[0] == ref_heads[0]
+    assert aligned_deps[5] == ref_deps[5]
+    assert aligned_heads[5] == ref_heads[5]

@@ -121,7 +121,7 @@ def test_parser_pseudoprojectivity(en_vocab):
     assert undeco_labels == ["det", "nsubj", "root", "det", "dobj", "aux",
                              "nsubj", "acl", "punct"]
     # if there are two potential new heads, the first one is chosen even if
-    # it"s wrong
+    # it's wrong
     proj_heads = [1, 1, 3, 1, 5, 6, 9, 8, 6, 1, 9, 12, 13, 10, 1]
     deco_labels = ["advmod||aux", "root", "det", "aux", "advmod", "det",
                    "dobj", "det", "nmod", "aux", "nmod||dobj", "advmod",

@@ -45,7 +45,17 @@ CONFLICTING_DATA = [
     ),
 ]

-eps = 0.01
+PARTIAL_DATA = [
+    (
+        "I like London.",
+        {
+            "heads": [1, 1, 1, None],
+            "deps": ["nsubj", "ROOT", "dobj", None],
+        },
+    ),
+]
+
+eps = 0.1


 def test_parser_root(en_vocab):

@@ -205,6 +215,32 @@ def test_parser_set_sent_starts(en_vocab):
             assert token.head in sent


+@pytest.mark.parametrize("pipe_name", ["parser", "beam_parser"])
+def test_incomplete_data(pipe_name):
+    # Test that the parser works with incomplete information
+    nlp = English()
+    parser = nlp.add_pipe(pipe_name)
+    train_examples = []
+    for text, annotations in PARTIAL_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
+        for dep in annotations.get("deps", []):
+            if dep is not None:
+                parser.add_label(dep)
+    optimizer = nlp.initialize(get_examples=lambda: train_examples)
+    for i in range(150):
+        losses = {}
+        nlp.update(train_examples, sgd=optimizer, losses=losses)
+    assert losses[pipe_name] < 0.0001
+
+    # test the trained model
+    test_text = "I like securities."
+    doc = nlp(test_text)
+    assert doc[0].dep_ == "nsubj"
+    assert doc[2].dep_ == "dobj"
+    assert doc[0].head.i == 1
+    assert doc[2].head.i == 1
+
+
 @pytest.mark.parametrize("pipe_name", ["parser", "beam_parser"])
 def test_overfitting_IO(pipe_name):
     # Simple test to try and quickly overfit the dependency parser (normal or beam)

@@ -217,7 +253,7 @@ def test_overfitting_IO(pipe_name):
             parser.add_label(dep)
     optimizer = nlp.initialize()
     # run overfitting
-    for i in range(150):
+    for i in range(200):
         losses = {}
         nlp.update(train_examples, sgd=optimizer, losses=losses)
     assert losses[pipe_name] < 0.0001

@@ -324,25 +360,25 @@ def test_beam_overfitting_IO():
     head_scores = head_scores[0]
     label_scores = label_scores[0]
     # test label annotations: 0=nsubj, 2=dobj, 3=punct
-    assert label_scores[(0, "nsubj")] == pytest.approx(1.0, eps)
-    assert label_scores[(0, "dobj")] == pytest.approx(0.0, eps)
-    assert label_scores[(0, "punct")] == pytest.approx(0.0, eps)
-    assert label_scores[(2, "nsubj")] == pytest.approx(0.0, eps)
-    assert label_scores[(2, "dobj")] == pytest.approx(1.0, eps)
-    assert label_scores[(2, "punct")] == pytest.approx(0.0, eps)
-    assert label_scores[(3, "nsubj")] == pytest.approx(0.0, eps)
-    assert label_scores[(3, "dobj")] == pytest.approx(0.0, eps)
-    assert label_scores[(3, "punct")] == pytest.approx(1.0, eps)
+    assert label_scores[(0, "nsubj")] == pytest.approx(1.0, abs=eps)
+    assert label_scores[(0, "dobj")] == pytest.approx(0.0, abs=eps)
+    assert label_scores[(0, "punct")] == pytest.approx(0.0, abs=eps)
+    assert label_scores[(2, "nsubj")] == pytest.approx(0.0, abs=eps)
+    assert label_scores[(2, "dobj")] == pytest.approx(1.0, abs=eps)
+    assert label_scores[(2, "punct")] == pytest.approx(0.0, abs=eps)
+    assert label_scores[(3, "nsubj")] == pytest.approx(0.0, abs=eps)
+    assert label_scores[(3, "dobj")] == pytest.approx(0.0, abs=eps)
+    assert label_scores[(3, "punct")] == pytest.approx(1.0, abs=eps)
     # test head annotations: the root is token at index 1
-    assert head_scores[(0, 0)] == pytest.approx(0.0, eps)
-    assert head_scores[(0, 1)] == pytest.approx(1.0, eps)
-    assert head_scores[(0, 2)] == pytest.approx(0.0, eps)
-    assert head_scores[(2, 0)] == pytest.approx(0.0, eps)
-    assert head_scores[(2, 1)] == pytest.approx(1.0, eps)
-    assert head_scores[(2, 2)] == pytest.approx(0.0, eps)
-    assert head_scores[(3, 0)] == pytest.approx(0.0, eps)
-    assert head_scores[(3, 1)] == pytest.approx(1.0, eps)
-    assert head_scores[(3, 2)] == pytest.approx(0.0, eps)
+    assert head_scores[(0, 0)] == pytest.approx(0.0, abs=eps)
+    assert head_scores[(0, 1)] == pytest.approx(1.0, abs=eps)
+    assert head_scores[(0, 2)] == pytest.approx(0.0, abs=eps)
+    assert head_scores[(2, 0)] == pytest.approx(0.0, abs=eps)
+    assert head_scores[(2, 1)] == pytest.approx(1.0, abs=eps)
+    assert head_scores[(2, 2)] == pytest.approx(0.0, abs=eps)
+    assert head_scores[(3, 0)] == pytest.approx(0.0, abs=eps)
+    assert head_scores[(3, 1)] == pytest.approx(1.0, abs=eps)
+    assert head_scores[(3, 2)] == pytest.approx(0.0, abs=eps)

     # Also test the results are still the same after IO
     with make_tempdir() as tmp_dir:

@@ -356,21 +392,21 @@ def test_beam_overfitting_IO():
         head_scores2 = head_scores2[0]
         label_scores2 = label_scores2[0]
         # check the results again
-        assert label_scores2[(0, "nsubj")] == pytest.approx(1.0, eps)
-        assert label_scores2[(0, "dobj")] == pytest.approx(0.0, eps)
-        assert label_scores2[(0, "punct")] == pytest.approx(0.0, eps)
-        assert label_scores2[(2, "nsubj")] == pytest.approx(0.0, eps)
-        assert label_scores2[(2, "dobj")] == pytest.approx(1.0, eps)
-        assert label_scores2[(2, "punct")] == pytest.approx(0.0, eps)
-        assert label_scores2[(3, "nsubj")] == pytest.approx(0.0, eps)
-        assert label_scores2[(3, "dobj")] == pytest.approx(0.0, eps)
-        assert label_scores2[(3, "punct")] == pytest.approx(1.0, eps)
-        assert head_scores2[(0, 0)] == pytest.approx(0.0, eps)
-        assert head_scores2[(0, 1)] == pytest.approx(1.0, eps)
-        assert head_scores2[(0, 2)] == pytest.approx(0.0, eps)
-        assert head_scores2[(2, 0)] == pytest.approx(0.0, eps)
-        assert head_scores2[(2, 1)] == pytest.approx(1.0, eps)
-        assert head_scores2[(2, 2)] == pytest.approx(0.0, eps)
-        assert head_scores2[(3, 0)] == pytest.approx(0.0, eps)
-        assert head_scores2[(3, 1)] == pytest.approx(1.0, eps)
-        assert head_scores2[(3, 2)] == pytest.approx(0.0, eps)
+        assert label_scores2[(0, "nsubj")] == pytest.approx(1.0, abs=eps)
+        assert label_scores2[(0, "dobj")] == pytest.approx(0.0, abs=eps)
+        assert label_scores2[(0, "punct")] == pytest.approx(0.0, abs=eps)
+        assert label_scores2[(2, "nsubj")] == pytest.approx(0.0, abs=eps)
+        assert label_scores2[(2, "dobj")] == pytest.approx(1.0, abs=eps)
+        assert label_scores2[(2, "punct")] == pytest.approx(0.0, abs=eps)
+        assert label_scores2[(3, "nsubj")] == pytest.approx(0.0, abs=eps)
+        assert label_scores2[(3, "dobj")] == pytest.approx(0.0, abs=eps)
+        assert label_scores2[(3, "punct")] == pytest.approx(1.0, abs=eps)
+        assert head_scores2[(0, 0)] == pytest.approx(0.0, abs=eps)
+        assert head_scores2[(0, 1)] == pytest.approx(1.0, abs=eps)
+        assert head_scores2[(0, 2)] == pytest.approx(0.0, abs=eps)
+        assert head_scores2[(2, 0)] == pytest.approx(0.0, abs=eps)
+        assert head_scores2[(2, 1)] == pytest.approx(1.0, abs=eps)
+        assert head_scores2[(2, 2)] == pytest.approx(0.0, abs=eps)
+        assert head_scores2[(3, 0)] == pytest.approx(0.0, abs=eps)
+        assert head_scores2[(3, 1)] == pytest.approx(1.0, abs=eps)
+        assert head_scores2[(3, 2)] == pytest.approx(0.0, abs=eps)

@@ -263,3 +263,43 @@ def test_Example_from_dict_sentences():
     annots = {"sent_starts": [1, -1, 0, 0, 0]}
     ex = Example.from_dict(predicted, annots)
     assert len(list(ex.reference.sents)) == 1
+
+
+def test_Example_missing_deps():
+    vocab = Vocab()
+    words = ["I", "like", "London", "and", "Berlin", "."]
+    deps = ["nsubj", "ROOT", "dobj", "cc", "conj", "punct"]
+    heads = [1, 1, 1, 2, 2, 1]
+    annots_head_only = {"words": words, "heads": heads}
+    annots_head_dep = {"words": words, "heads": heads, "deps": deps}
+    predicted = Doc(vocab, words=words)
+
+    # when not providing deps, the head information is considered to be missing
+    # in this case, the token's heads refer to themselves
+    example_1 = Example.from_dict(predicted, annots_head_only)
+    assert [t.head.i for t in example_1.reference] == [0, 1, 2, 3, 4, 5]
+
+    # when providing deps, the head information is actually used
+    example_2 = Example.from_dict(predicted, annots_head_dep)
+    assert [t.head.i for t in example_2.reference] == heads
+
+
+def test_Example_missing_heads():
+    vocab = Vocab()
+    words = ["I", "like", "London", "and", "Berlin", "."]
+    deps = ["nsubj", "ROOT", "dobj", None, "conj", "punct"]
+    heads = [1, 1, 1, None, 2, 1]
+    annots = {"words": words, "heads": heads, "deps": deps}
+    predicted = Doc(vocab, words=words)
+
+    example = Example.from_dict(predicted, annots)
+    parsed_heads = [t.head.i for t in example.reference]
+    assert parsed_heads[0] == heads[0]
+    assert parsed_heads[1] == heads[1]
+    assert parsed_heads[2] == heads[2]
+    assert parsed_heads[4] == heads[4]
+    assert parsed_heads[5] == heads[5]
+    assert [t.has_head() for t in example.reference] == [True, True, True, False, True, True]
+
+    # Ensure that the missing head doesn't create an artificial new sentence start
+    assert example.get_aligned_sent_starts() == [True, False, False, False, False, False]

@@ -436,7 +436,8 @@ def test_gold_ner_missing_tags(en_tokenizer):
 def test_projectivize(en_tokenizer):
     doc = en_tokenizer("He pretty quickly walks away")
     heads = [3, 2, 3, 0, 2]
-    example = Example.from_dict(doc, {"heads": heads})
+    deps = ["dep"] * len(heads)
+    example = Example.from_dict(doc, {"heads": heads, "deps": deps})
     proj_heads, proj_labels = example.get_aligned_parse(projectivize=True)
     nonproj_heads, nonproj_labels = example.get_aligned_parse(projectivize=False)
     assert proj_heads == [3, 2, 3, 0, 3]

@@ -16,6 +16,7 @@ from thinc.util import copy_array
 import warnings

 from .span cimport Span
+from .token cimport MISSING_DEP
 from ._dict_proxies import SpanGroups
 from .token cimport Token
 from ..lexeme cimport Lexeme, EMPTY_LEXEME

@@ -268,7 +269,10 @@ cdef class Doc:
             self.push_back(lexeme, has_space)

         if heads is not None:
-            heads = [head - i for i, head in enumerate(heads)]
+            heads = [head - i if head is not None else 0 for i, head in enumerate(heads)]
+        if deps is not None:
+            MISSING_DEP_ = self.vocab.strings[MISSING_DEP]
+            deps = [dep if dep is not None else MISSING_DEP_ for dep in deps]
         if deps and not heads:
             heads = [0] * len(deps)
         if sent_starts is not None:

@@ -330,7 +334,8 @@ cdef class Doc:
                 if annot is not heads and annot is not sent_starts and annot is not ent_iobs:
                     values.extend(annot)
         for value in values:
-            self.vocab.strings.add(value)
+            if value is not None:
+                self.vocab.strings.add(value)

         # if there are any other annotations, set them
         if headings:

@@ -1533,7 +1538,7 @@ cdef int set_children_from_heads(TokenC* tokens, int start, int end) except -1:
     for i in range(start, end):
         tokens[i].sent_start = -1
     for i in range(start, end):
-        if tokens[i].head == 0:
+        if tokens[i].head == 0 and not Token.missing_head(&tokens[i]):
             tokens[tokens[i].l_edge].sent_start = 1

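A minimal sketch of the constructor behaviour changed above, using the words and values from the test_missing_head_dep test added in this PR: a None head is stored as a self-offset of 0, a None dep maps to the MISSING_DEP sentinel, and a token with a missing head no longer opens an artificial new sentence.

    from spacy.tokens import Doc
    from spacy.vocab import Vocab

    words = ["I", "like", "London", "and", "Berlin", "."]
    heads = [1, 1, 1, 1, 2, None]                    # head of "." is unknown
    deps = ["", "ROOT", "dobj", "cc", "conj", None]  # deps of "I" and "." are unknown
    doc = Doc(Vocab(), words=words, heads=heads, deps=deps)
    # the missing head is not treated as a root, so it does not start a new sentence
    assert [t.is_sent_start for t in doc] == [True, False, False, False, False, False]
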
@@ -9,6 +9,7 @@ from ..lexeme cimport Lexeme

 from ..errors import Errors

+cdef int MISSING_DEP = 0

 cdef class Token:
     cdef readonly Vocab vocab

@@ -94,3 +95,13 @@ cdef class Token:
             token.ent_kb_id = value
         elif feat_name == SENT_START:
             token.sent_start = value
+
+    @staticmethod
+    cdef inline int missing_dep(const TokenC* token) nogil:
+        return token.dep == MISSING_DEP
+
+    @staticmethod
+    cdef inline int missing_head(const TokenC* token) nogil:
+        return Token.missing_dep(token)

@@ -638,14 +638,26 @@ cdef class Token:
             return False
         return any(ancestor.i == self.i for ancestor in descendant.ancestors)

+    def has_head(self):
+        """Check whether the token has annotated head information.
+        Return False when the head annotation is unset/missing.
+
+        RETURNS (bool): Whether the head annotation is valid or not.
+        """
+        return not Token.missing_head(self.c)
+
     property head:
         """The syntactic parent, or "governor", of this token.
+        If token.has_head() is `False`, this method will return itself.
+
         RETURNS (Token): The token predicted by the parser to be the head of
             the current token.
         """
         def __get__(self):
-            return self.doc[self.i + self.c.head]
+            if not self.has_head():
+                return self
+            else:
+                return self.doc[self.i + self.c.head]

         def __set__(self, Token new_head):
             # This function sets the head of self to new_head and updates the

@@ -858,6 +870,14 @@ cdef class Token:
         def __set__(self, tag):
             self.tag = self.vocab.strings.add(tag)

+    def has_dep(self):
+        """Check whether the token has annotated dep information.
+        Returns False when the dep label is unset/missing.
+
+        RETURNS (bool): Whether the dep label is valid or not.
+        """
+        return not Token.missing_dep(self.c)
+
     property dep_:
         """RETURNS (str): The syntactic dependency label."""
         def __get__(self):

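A short usage sketch of the new accessors, with values taken from the tests in this PR: has_head() and has_dep() report whether the annotation is present, and head falls back to the token itself when the head is missing.

    from spacy.tokens import Doc
    from spacy.vocab import Vocab

    doc = Doc(Vocab(), words=["I", "like", "London", "and", "Berlin", "."],
              heads=[1, 1, 1, 1, 2, None],
              deps=["", "ROOT", "dobj", "cc", "conj", None])
    assert [t.has_head() for t in doc] == [False, True, True, True, True, False]
    assert [t.has_dep() for t in doc] == [False, True, True, True, True, False]
    assert doc[5].head.i == 5  # missing head: the token is returned as its own head
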
@@ -12,6 +12,7 @@ from .iob_utils import biluo_to_iob, offsets_to_biluo_tags, doc_to_biluo_tags
 from .iob_utils import biluo_tags_to_spans
 from ..errors import Errors, Warnings
 from ..pipeline._parser_internals import nonproj
+from ..tokens.token cimport MISSING_DEP
 from ..util import logger


@@ -179,10 +180,15 @@ cdef class Example:
         gold_to_cand = self.alignment.y2x
         aligned_heads = [None] * self.x.length
         aligned_deps = [None] * self.x.length
+        has_deps = [token.has_dep() for token in self.y]
+        has_heads = [token.has_head() for token in self.y]
         heads = [token.head.i for token in self.y]
         deps = [token.dep_ for token in self.y]
         if projectivize:
-            heads, deps = nonproj.projectivize(heads, deps)
+            proj_heads, proj_deps = nonproj.projectivize(heads, deps)
+            # ensure that missing data remains missing
+            heads = [h if has_heads[i] else heads[i] for i, h in enumerate(proj_heads)]
+            deps = [d if has_deps[i] else deps[i] for i, d in enumerate(proj_deps)]
         for cand_i in range(self.x.length):
             if cand_to_gold.lengths[cand_i] == 1:
                 gold_i = cand_to_gold[cand_i].dataXd[0, 0]

@@ -329,7 +335,10 @@ def _annot2array(vocab, tok_annot, doc_annot):
             pass
         elif key == "HEAD":
             attrs.append(key)
-            values.append([h-i for i, h in enumerate(value)])
+            values.append([h-i if h is not None else 0 for i, h in enumerate(value)])
+        elif key == "DEP":
+            attrs.append(key)
+            values.append([vocab.strings.add(h) if h is not None else MISSING_DEP for h in value])
         elif key == "SENT_START":
             attrs.append(key)
             values.append(value)

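A sketch of the Example side of the change, mirroring the test_Example_missing_heads test added above: missing entries survive Example.from_dict, and a gap in the heads does not introduce a new sentence start in the aligned data.

    from spacy.tokens import Doc
    from spacy.training import Example
    from spacy.vocab import Vocab

    vocab = Vocab()
    words = ["I", "like", "London", "and", "Berlin", "."]
    annots = {
        "words": words,
        "heads": [1, 1, 1, None, 2, 1],
        "deps": ["nsubj", "ROOT", "dobj", None, "conj", "punct"],
    }
    example = Example.from_dict(Doc(vocab, words=words), annots)
    assert [t.has_head() for t in example.reference] == [True, True, True, False, True, True]
    assert example.get_aligned_sent_starts() == [True, False, False, False, False, False]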