From b8461e71b7552f9a62be7e6393c1655fd4c7c187 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sun, 1 Apr 2018 18:03:48 +0200
Subject: [PATCH] Prepare ArcEager.preprocess_gold to handle subtokens

---
 spacy/syntax/arc_eager.pyx | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx
index d04328d44..6c76a2547 100644
--- a/spacy/syntax/arc_eager.pyx
+++ b/spacy/syntax/arc_eager.pyx
@@ -474,16 +474,21 @@ cdef class ArcEager(TransitionSystem):
     def preprocess_gold(self, GoldParse gold):
         if not self.has_gold(gold):
             return None
-        for i, (head, dep) in enumerate(zip(gold.heads, gold.labels)):
+        for i, (head_group, dep_group) in enumerate(zip(gold.heads, gold.labels)):
             # Missing values
-            if head is None or dep is None:
+            if head_group is None or dep_group is None:
                 gold.c.heads[i] = i
                 gold.c.has_dep[i] = False
-            elif isinstance(head, list):
-                # TODO: This is where the fused token stuff will happen
-                gold.c.heads[i] = i
-                gold.c.has_dep[i] = False
-            else:
+                continue
+            if not isinstance(head_group, list):
+                # Map the simple format into the elaborate one we need for
+                # the fused tokens.
+                head_group = [(head_group, 0)]
+                dep_group = [dep_group]
+            for head_addr, dep in zip(head_group, dep_group):
+                if not isinstance(head_addr, tuple):
+                    head_addr = (head_addr, 0)
+                head, subtoken = head_addr
                 if head > i:
                     action = LEFT
                 elif head < i: