From 06a5be9dfda12d87aa658c42c00b68eabedf48a6 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 3 Apr 2018 00:55:05 +0200 Subject: [PATCH] Fix handling of heads for undersegmented tokens --- spacy/_align.pyx | 4 +++- spacy/gold.pxd | 2 -- spacy/gold.pyx | 51 ++++++++++++++++++++++++++++++++++++++---------- 3 files changed, 44 insertions(+), 13 deletions(-) diff --git a/spacy/_align.pyx b/spacy/_align.pyx index 750b88c44..cba117f94 100644 --- a/spacy/_align.pyx +++ b/spacy/_align.pyx @@ -109,7 +109,9 @@ class Alignment(object): ''' output = [] for i, alignment in enumerate(self._y2t): - if isinstance(alignment, int): + if alignment is None: + output.append(None) + elif isinstance(alignment, int): output.append(items[alignment]) elif isinstance(alignment, tuple): output.append((items[alignment[0]], alignment[1])) diff --git a/spacy/gold.pxd b/spacy/gold.pxd index 2be87b72a..6c268959b 100644 --- a/spacy/gold.pxd +++ b/spacy/gold.pxd @@ -34,8 +34,6 @@ cdef class GoldParse: cdef public object cats cdef public object _alignment - cdef readonly list cand_to_gold - cdef readonly list gold_to_cand cdef readonly list orig_annot diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 279d218dc..223030e34 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -443,19 +443,45 @@ cdef class GoldParse: # sequence of gold words. # If we "mis-segment", we'll have a sequence of predicted words covering # a sequence of gold words. That's many-to-many -- we don't do that. - self._alignment = Alignment([t.orth_ for t in doc], words) + if words is not None: + self._alignment = Alignment([t.text for t in doc], words) + else: + self._alignment = Alignment([t.text for t in doc], [t.text for t in doc]) annot_tuples = (range(len(words)), words, tags, heads, deps, entities) self.orig_annot = list(zip(*annot_tuples)) - self.words = self._alignment.to_yours(words) - self.tags = self._alignment.to_yours(tags) - self.labels = self._alignment.to_yours(deps) - self.tags = self._alignment.to_yours(tags) - self.ner = self._alignment.to_yours(entities) - - aligned_heads = [self._alignment.index_to_yours(h) for h in heads] - self.heads = self._alignment.to_yours(aligned_heads) + if words is not None: + self.words = self._alignment.to_yours(words) + if tags is not None: + self.tags = self._alignment.to_yours(tags) + if deps is not None: + self.labels = self._alignment.to_yours(deps) + if tags is not None: + self.tags = self._alignment.to_yours(tags) + if entities is not None: + self.ner = self._alignment.to_yours(entities) + if heads is not None: + for gold_i, gold_head in enumerate(heads): + if gold_head is None: + continue + cand_i = self._alignment._t2y[gold_i] + cand_head = self._alignment._t2y[gold_head] + if cand_i is None or cand_head is None: + continue + elif isinstance(cand_i, int): + self.heads[cand_i] = cand_head + elif isinstance(cand_i, list): + for sub_i in cand_i[:-1]: + self.heads[sub_i] = sub_i+1 + self.heads[cand_i[-1]] = cand_head + elif isinstance(cand_i, tuple): + cand_i, sub_i = cand_i + if not isinstance(self.heads[cand_i], list): + self.heads[cand_i] = [] + while len(self.heads[cand_i]) <= sub_i: + self.heads[cand_i].append(None) + self.heads[cand_i][sub_i] = cand_head for i in range(len(doc)): # Fix spaces @@ -472,13 +498,18 @@ cdef class GoldParse: or not isinstance(self.labels[i+1], tuple) \ or self.labels[i][1] < sub_i: self.labels[i] = self.labels[i][0] - self.heads[i] = self.heads[i][0] else: self.labels[i] = 'subtok' self.heads[i] = i+1 cycle = nonproj.contains_cycle(self._alignment.flatten(self.heads)) if cycle is not None: + print(repr(doc.text)) + print([t.text for t in doc]) + print(words) + print(self.labels) + print(list(enumerate(self.heads))) + print(heads) raise Exception("Cycle found: %s" % cycle) def __len__(self):