From 1d5cad0b42c5919dde27a59808ff97f8e15cfaa0 Mon Sep 17 00:00:00 2001
From: Madeesh Kannan <shadeMe@users.noreply.github.com>
Date: Tue, 28 Jun 2022 19:42:58 +0200
Subject: [PATCH] `Example.get_aligned_parse`: Handle unit and zero length
 vectors correctly (#11026)

* `Example.get_aligned_parse`: Do not squeeze the gold token index vector,
so that a unit-length vector stays one-dimensional. Pass `otypes` to
`np.vectorize` so that zero-size vectors are handled correctly

* Add tests

* Use `Doc` ctor to initialize attributes

* Remove unintended change

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>

* Remove unused import

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
---
 spacy/tests/training/test_training.py | 25 +++++++++++++++++++++++++
 spacy/training/example.pyx            |  6 +++---
 2 files changed, 28 insertions(+), 3 deletions(-)

diff --git a/spacy/tests/training/test_training.py b/spacy/tests/training/test_training.py
index 31bf7e07b..4384a796d 100644
--- a/spacy/tests/training/test_training.py
+++ b/spacy/tests/training/test_training.py
@@ -679,6 +679,31 @@ def test_projectivize(en_tokenizer):
     assert proj_heads == [3, 2, 3, 3, 3]
     assert nonproj_heads == [3, 2, 3, 3, 2]
 
+    # Test single token documents
+    doc = en_tokenizer("Conrail")
+    heads = [0]
+    deps = ["dep"]
+    example = Example.from_dict(doc, {"heads": heads, "deps": deps})
+    proj_heads, proj_labels = example.get_aligned_parse(projectivize=True)
+    assert proj_heads == heads
+    assert proj_labels == deps
+
+    # Test documents with no alignments
+    doc_a = Doc(
+        doc.vocab, words=["Double-Jointed"], spaces=[False], deps=["ROOT"], heads=[0]
+    )
+    doc_b = Doc(
+        doc.vocab,
+        words=["Double", "-", "Jointed"],
+        spaces=[True, True, True],
+        deps=["amod", "punct", "ROOT"],
+        heads=[2, 2, 2],
+    )
+    example = Example(doc_a, doc_b)
+    proj_heads, proj_deps = example.get_aligned_parse(projectivize=True)
+    assert proj_heads == [None]
+    assert proj_deps == [None]
+
 
 def test_iob_to_biluo():
     good_iob = ["O", "O", "B-LOC", "I-LOC", "O", "B-PERSON"]
diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx
index 473364f93..d592e5a52 100644
--- a/spacy/training/example.pyx
+++ b/spacy/training/example.pyx
@@ -249,9 +249,9 @@ cdef class Example:
         # Fetch all aligned gold token incides.
         if c2g_single_toks.shape == cand_to_gold.lengths.shape:
             # This the most likely case.
-            gold_i = cand_to_gold[:].squeeze()
+            gold_i = cand_to_gold[:]
         else:
-            gold_i = numpy.vectorize(lambda x: cand_to_gold[int(x)][0])(c2g_single_toks).squeeze()
+            gold_i = numpy.vectorize(lambda x: cand_to_gold[int(x)][0], otypes='i')(c2g_single_toks)
 
         # Fetch indices of all gold heads for the aligned gold tokens.
         heads = numpy.asarray(heads, dtype='i')
@@ -261,7 +261,7 @@ cdef class Example:
         # gold tokens (and are aligned to a single candidate token).
         g2c_len_heads = gold_to_cand.lengths[gold_head_i]
         g2c_len_heads = numpy.where(g2c_len_heads == 1)[0]
-        g2c_i = numpy.vectorize(lambda x: gold_to_cand[int(x)][0])(gold_head_i[g2c_len_heads]).squeeze()
+        g2c_i = numpy.vectorize(lambda x: gold_to_cand[int(x)][0], otypes='i')(gold_head_i[g2c_len_heads]).squeeze()
 
         # Update head/dep alignments with the above.
         aligned_heads = numpy.full((self.x.length), None)