From 1d5cad0b42c5919dde27a59808ff97f8e15cfaa0 Mon Sep 17 00:00:00 2001 From: Madeesh Kannan Date: Tue, 28 Jun 2022 19:42:58 +0200 Subject: [PATCH] `Example.get_aligned_parse`: Handle unit and zero length vectors correctly (#11026) * `Example.get_aligned_parse`: Do not squeeze gold token idx vector Correctly handle zero-size vectors passed to `np.vectorize` * Add tests * Use `Doc` ctor to initialize attributes * Remove unintended change Co-authored-by: Adriane Boyd * Remove unused import Co-authored-by: Adriane Boyd --- spacy/tests/training/test_training.py | 25 +++++++++++++++++++++++++ spacy/training/example.pyx | 6 +++--- 2 files changed, 28 insertions(+), 3 deletions(-) diff --git a/spacy/tests/training/test_training.py b/spacy/tests/training/test_training.py index 31bf7e07b..4384a796d 100644 --- a/spacy/tests/training/test_training.py +++ b/spacy/tests/training/test_training.py @@ -679,6 +679,31 @@ def test_projectivize(en_tokenizer): assert proj_heads == [3, 2, 3, 3, 3] assert nonproj_heads == [3, 2, 3, 3, 2] + # Test single token documents + doc = en_tokenizer("Conrail") + heads = [0] + deps = ["dep"] + example = Example.from_dict(doc, {"heads": heads, "deps": deps}) + proj_heads, proj_labels = example.get_aligned_parse(projectivize=True) + assert proj_heads == heads + assert proj_labels == deps + + # Test documents with no alignments + doc_a = Doc( + doc.vocab, words=["Double-Jointed"], spaces=[False], deps=["ROOT"], heads=[0] + ) + doc_b = Doc( + doc.vocab, + words=["Double", "-", "Jointed"], + spaces=[True, True, True], + deps=["amod", "punct", "ROOT"], + heads=[2, 2, 2], + ) + example = Example(doc_a, doc_b) + proj_heads, proj_deps = example.get_aligned_parse(projectivize=True) + assert proj_heads == [None] + assert proj_deps == [None] + def test_iob_to_biluo(): good_iob = ["O", "O", "B-LOC", "I-LOC", "O", "B-PERSON"] diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx index 473364f93..d592e5a52 100644 --- a/spacy/training/example.pyx +++ b/spacy/training/example.pyx @@ -249,9 +249,9 @@ cdef class Example: # Fetch all aligned gold token incides. if c2g_single_toks.shape == cand_to_gold.lengths.shape: # This the most likely case. - gold_i = cand_to_gold[:].squeeze() + gold_i = cand_to_gold[:] else: - gold_i = numpy.vectorize(lambda x: cand_to_gold[int(x)][0])(c2g_single_toks).squeeze() + gold_i = numpy.vectorize(lambda x: cand_to_gold[int(x)][0], otypes='i')(c2g_single_toks) # Fetch indices of all gold heads for the aligned gold tokens. heads = numpy.asarray(heads, dtype='i') @@ -261,7 +261,7 @@ cdef class Example: # gold tokens (and are aligned to a single candidate token). g2c_len_heads = gold_to_cand.lengths[gold_head_i] g2c_len_heads = numpy.where(g2c_len_heads == 1)[0] - g2c_i = numpy.vectorize(lambda x: gold_to_cand[int(x)][0])(gold_head_i[g2c_len_heads]).squeeze() + g2c_i = numpy.vectorize(lambda x: gold_to_cand[int(x)][0], otypes='i')(gold_head_i[g2c_len_heads]).squeeze() # Update head/dep alignments with the above. aligned_heads = numpy.full((self.x.length), None)