From 06a5be9dfda12d87aa658c42c00b68eabedf48a6 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Tue, 3 Apr 2018 00:55:05 +0200
Subject: [PATCH] Fix handling of heads for undersegmented tokens

---
 spacy/_align.pyx |  4 +++-
 spacy/gold.pxd   |  2 --
 spacy/gold.pyx   | 51 ++++++++++++++++++++++++++++++++++++++----------
 3 files changed, 44 insertions(+), 13 deletions(-)

diff --git a/spacy/_align.pyx b/spacy/_align.pyx
index 750b88c44..cba117f94 100644
--- a/spacy/_align.pyx
+++ b/spacy/_align.pyx
@@ -109,7 +109,9 @@ class Alignment(object):
         '''
         output = []
         for i, alignment in enumerate(self._y2t):
-            if isinstance(alignment, int):
+            if alignment is None:
+                output.append(None)
+            elif isinstance(alignment, int):
                 output.append(items[alignment])
             elif isinstance(alignment, tuple):
                 output.append((items[alignment[0]], alignment[1]))
diff --git a/spacy/gold.pxd b/spacy/gold.pxd
index 2be87b72a..6c268959b 100644
--- a/spacy/gold.pxd
+++ b/spacy/gold.pxd
@@ -34,8 +34,6 @@ cdef class GoldParse:
     cdef public object cats
     cdef public object _alignment
 
-    cdef readonly list cand_to_gold
-    cdef readonly list gold_to_cand
     cdef readonly list orig_annot
 
 
diff --git a/spacy/gold.pyx b/spacy/gold.pyx
index 279d218dc..223030e34 100644
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@@ -443,19 +443,45 @@ cdef class GoldParse:
         # sequence of gold words.
         # If we "mis-segment", we'll have a sequence of predicted words covering
         # a sequence of gold words. That's many-to-many -- we don't do that.
-        self._alignment = Alignment([t.orth_ for t in doc], words)
+        if words is not None:
+            self._alignment = Alignment([t.text for t in doc], words)
+        else:
+            self._alignment = Alignment([t.text for t in doc], [t.text for t in doc])
 
         annot_tuples = (range(len(words)), words, tags, heads, deps, entities)
         self.orig_annot = list(zip(*annot_tuples))
 
-        self.words = self._alignment.to_yours(words)
-        self.tags = self._alignment.to_yours(tags)
-        self.labels = self._alignment.to_yours(deps)
-        self.tags = self._alignment.to_yours(tags)
-        self.ner = self._alignment.to_yours(entities)
-
-        aligned_heads = [self._alignment.index_to_yours(h) for h in heads]
-        self.heads = self._alignment.to_yours(aligned_heads)
+        if words is not None:
+            self.words = self._alignment.to_yours(words)
+        if tags is not None:
+            self.tags = self._alignment.to_yours(tags)
+        if deps is not None:
+            self.labels = self._alignment.to_yours(deps)
+        if tags is not None:
+            self.tags = self._alignment.to_yours(tags)
+        if entities is not None:
+            self.ner = self._alignment.to_yours(entities)
+        if heads is not None:
+            for gold_i, gold_head in enumerate(heads):
+                if gold_head is None:
+                    continue
+                cand_i = self._alignment._t2y[gold_i]
+                cand_head = self._alignment._t2y[gold_head]
+                if cand_i is None or cand_head is None:
+                    continue
+                elif isinstance(cand_i, int):
+                    self.heads[cand_i] = cand_head
+                elif isinstance(cand_i, list):
+                    for sub_i in cand_i[:-1]:
+                        self.heads[sub_i] = sub_i+1
+                    self.heads[cand_i[-1]] = cand_head
+                elif isinstance(cand_i, tuple):
+                    cand_i, sub_i = cand_i
+                    if not isinstance(self.heads[cand_i], list):
+                        self.heads[cand_i] = []
+                    while len(self.heads[cand_i]) <= sub_i:
+                        self.heads[cand_i].append(None)
+                    self.heads[cand_i][sub_i] = cand_head
 
         for i in range(len(doc)):
             # Fix spaces
@@ -472,13 +498,18 @@ cdef class GoldParse:
                 or not isinstance(self.labels[i+1], tuple) \
                 or self.labels[i][1] < sub_i:
                     self.labels[i] = self.labels[i][0]
-                    self.heads[i] = self.heads[i][0]
                 else:
                     self.labels[i] = 'subtok'
                     self.heads[i] = i+1
 
         cycle = nonproj.contains_cycle(self._alignment.flatten(self.heads))
         if cycle is not None:
+            print(repr(doc.text))
+            print([t.text for t in doc])
+            print(words)
+            print(self.labels)
+            print(list(enumerate(self.heads)))
+            print(heads)
             raise Exception("Cycle found: %s" % cycle)
 
     def __len__(self):