diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 46f16a229..c534c2d9b 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -72,7 +72,7 @@ punct_re = re.compile(r'\W') def align(cand_words, gold_words): if cand_words == gold_words: alignment = numpy.arange(len(cand_words)) - return 0, alignment, alignment, {}, {} + return 0, alignment, alignment, {}, {}, [] cand_words = [w.replace(' ', '') for w in cand_words] gold_words = [w.replace(' ', '') for w in gold_words] cost, i2j, j2i, matrix = _align.align(cand_words, gold_words) @@ -86,7 +86,12 @@ def align(cand_words, gold_words): if j2i_multi.get(j+1) != i and j2i_multi.get(j-1) != i: j2i[j] = i j2i_multi.pop(j) - return cost, i2j, j2i, i2j_multi, j2i_multi + reverse_j2i, reverse_i2j = _align.multi_align(j2i, i2j, [len(w) for w in gold_words], + [len(w) for w in cand_words]) + undersegmented = {} + for j, i in reverse_j2i.items(): + undersegmented.setdefault(i, []).append(j) + return cost, i2j, j2i, i2j_multi, j2i_multi, undersegmented class GoldCorpus(object): @@ -380,6 +385,47 @@ def _consume_ent(tags): return [start] + middle + [end] +def _flatten_fused_heads(heads): + '''Let's say we have a heads array with fused tokens. We might have + something like: + + [[(0, 1), 1], 1] + + This indicates that token 0 aligns to two gold tokens. The head of the + first subtoken is the second subtoken. The head of the second subtoken + is the second token. + + So we expand to a tree: + + [1, 2, 2] + + This is helpful for preventing other functions from knowing our weird + format. + ''' + # Get an alignment -- normalize to the more complicated format; so + # if we have an int i, treat it as [(i, 0)] + j = 0 + alignment = {(None, 0): None} + for i, tokens in enumerate(heads): + if not isinstance(tokens, list): + alignment[(i, 0)] = j + j += 1 + else: + for sub_i in range(len(tokens)): + alignment[(i, sub_i)] = j + j += 1 + # Apply the alignment to get the new values + new = [] + for head_vals in heads: + if not isinstance(head_vals, list): + head_vals = [(head_vals, 0)] + for head_val in head_vals: + if not isinstance(head_val, tuple): + head_val = (head_val, 0) + new.append(alignment[head_val]) + return new + + cdef class GoldParse: """Collection for training annotations.""" @classmethod @@ -418,15 +464,15 @@ cdef class GoldParse: if words is None: words = [token.text for token in doc] if tags is None: - tags = [None for _ in doc] + tags = [None for _ in words] if heads is None: - heads = [None for token in doc] + heads = [None for token in words] if deps is None: - deps = [None for _ in doc] + deps = [None for _ in words] if entities is None: - entities = [None for _ in doc] + entities = [None for _ in words] elif len(entities) == 0: - entities = ['O' for _ in doc] + entities = ['O' for _ in words] elif not isinstance(entities[0], basestring): # Assume we have entities specified by character offset. entities = biluo_tags_from_offsets(doc, entities) @@ -462,7 +508,7 @@ cdef class GoldParse: # sequence of gold words. # If we "mis-segment", we'll have a sequence of predicted words covering # a sequence of gold words. That's many-to-many -- we don't do that. - cost, i2j, j2i, i2j_multi, j2i_multi = align([t.orth_ for t in doc], words) + cost, i2j, j2i, i2j_multi, j2i_multi, undersegmented = align([t.orth_ for t in doc], words) self.cand_to_gold = [(j if j >= 0 else None) for j in i2j] self.gold_to_cand = [(i if i >= 0 else None) for i in j2i] @@ -478,7 +524,18 @@ cdef class GoldParse: self.labels[i] = None self.ner[i] = 'O' if gold_i is None: - if i in i2j_multi: + if i in undersegmented: + self.words[i] = [words[j] for j in undersegmented[i]] + self.tags[i] = [tags[j] for j in undersegmented[i]] + self.labels[i] = [deps[j] for j in undersegmented[i]] + self.ner[i] = [entities[j] for j in undersegmented[i]] + self.heads[i] = [] + for h in [heads[j] for j in undersegmented[i]]: + if heads[h] is None: + self.heads[i].append(None) + else: + self.heads[i].append(self.gold_to_cand[heads[h]]) + elif i in i2j_multi: self.words[i] = words[i2j_multi[i]] self.tags[i] = tags[i2j_multi[i]] is_last = i2j_multi[i] != i2j_multi.get(i+1) @@ -495,25 +552,26 @@ cdef class GoldParse: # BILOU tags. We can't have BB or LL etc. # Case 1: O -- easy. ner_tag = entities[i2j_multi[i]] - if ner_tag == 'O': - self.ner[i] = 'O' - # Case 2: U. This has to become a B I* L sequence. - elif ner_tag.startswith('U-'): - if is_first: - self.ner[i] = ner_tag.replace('U-', 'B-', 1) - elif is_last: - self.ner[i] = ner_tag.replace('U-', 'L-', 1) - else: - self.ner[i] = ner_tag.replace('U-', 'I-', 1) - # Case 3: L. If not last, change to I. - elif ner_tag.startswith('L-'): - if is_last: + if ner_tag is not None: + if ner_tag == 'O': + self.ner[i] = 'O' + # Case 2: U. This has to become a B I* L sequence. + elif ner_tag.startswith('U-'): + if is_first: + self.ner[i] = ner_tag.replace('U-', 'B-', 1) + elif is_last: + self.ner[i] = ner_tag.replace('U-', 'L-', 1) + else: + self.ner[i] = ner_tag.replace('U-', 'I-', 1) + # Case 3: L. If not last, change to I. + elif ner_tag.startswith('L-'): + if is_last: + self.ner[i] = ner_tag + else: + self.ner[i] = ner_tag.replace('L-', 'I-', 1) + # Case 4: I. Stays correct + elif ner_tag.startswith('I-'): self.ner[i] = ner_tag - else: - self.ner[i] = ner_tag.replace('L-', 'I-', 1) - # Case 4: I. Stays correct - elif ner_tag.startswith('I-'): - self.ner[i] = ner_tag else: self.words[i] = words[gold_i] self.tags[i] = tags[gold_i] @@ -523,8 +581,7 @@ cdef class GoldParse: self.heads[i] = self.gold_to_cand[heads[gold_i]] self.labels[i] = deps[gold_i] self.ner[i] = entities[gold_i] - - cycle = nonproj.contains_cycle(self.heads) + cycle = nonproj.contains_cycle(_flatten_fused_heads(self.heads)) if cycle is not None: raise Exception("Cycle found: %s" % cycle)