Represent fused tokens in GoldParse

Entries in GoldParse.{words, heads, tags, deps, ner} can now be lists
instead of single values, to handle getting the analysis for fused
tokens. For instance, let's say we have a token like "hows", while the
gold-standard has two tokens, ["how", "s"]. We need to store the gold
data for each of the two subtokens.

Example gold.words: [["how", "s"], "it", "going"]

Things get more complicated for heads, as we need to address particular
subtokens. Let's say the gold heads for ["how", "s", "it", "going"] are
[1, 1, 3, 1], i.e. the root "s" is itself a subtoken of the fused token
"hows". The gold.heads list would be:

    [[(0, 1), (0, 1)], 2, (0, 1)]

Each tuple is (token index, subtoken index), so (0, 1) means token 0,
subtoken 1. A helper function _flatten_fused_heads is available that
unpacks the above to [1, 1, 3, 1].
This commit is contained in:
Matthew Honnibal 2018-04-01 17:18:18 +02:00
parent a64680c137
commit 3d182fbc43

View File

@ -72,7 +72,7 @@ punct_re = re.compile(r'\W')
def align(cand_words, gold_words):
if cand_words == gold_words:
alignment = numpy.arange(len(cand_words))
return 0, alignment, alignment, {}, {}
return 0, alignment, alignment, {}, {}, []
cand_words = [w.replace(' ', '') for w in cand_words]
gold_words = [w.replace(' ', '') for w in gold_words]
cost, i2j, j2i, matrix = _align.align(cand_words, gold_words)
@ -86,7 +86,12 @@ def align(cand_words, gold_words):
if j2i_multi.get(j+1) != i and j2i_multi.get(j-1) != i:
j2i[j] = i
j2i_multi.pop(j)
return cost, i2j, j2i, i2j_multi, j2i_multi
reverse_j2i, reverse_i2j = _align.multi_align(j2i, i2j, [len(w) for w in gold_words],
[len(w) for w in cand_words])
undersegmented = {}
for j, i in reverse_j2i.items():
undersegmented.setdefault(i, []).append(j)
return cost, i2j, j2i, i2j_multi, j2i_multi, undersegmented
class GoldCorpus(object):
@ -380,6 +385,47 @@ def _consume_ent(tags):
return [start] + middle + [end]
def _flatten_fused_heads(heads):
'''Let's say we have a heads array with fused tokens. We might have
something like:
[[(0, 1), 1], 1]
This indicates that token 0 aligns to two gold tokens. The head of the
first subtoken is the second subtoken. The head of the second subtoken
is the second token.
So we expand to a tree:
[1, 2, 2]
This is helpful for preventing other functions from knowing our weird
format.
'''
# Get an alignment -- normalize to the more complicated format; so
# if we have an int i, treat it as [(i, 0)]
j = 0
alignment = {(None, 0): None}
for i, tokens in enumerate(heads):
if not isinstance(tokens, list):
alignment[(i, 0)] = j
j += 1
else:
for sub_i in range(len(tokens)):
alignment[(i, sub_i)] = j
j += 1
# Apply the alignment to get the new values
new = []
for head_vals in heads:
if not isinstance(head_vals, list):
head_vals = [(head_vals, 0)]
for head_val in head_vals:
if not isinstance(head_val, tuple):
head_val = (head_val, 0)
new.append(alignment[head_val])
return new
cdef class GoldParse:
"""Collection for training annotations."""
@classmethod
@ -418,15 +464,15 @@ cdef class GoldParse:
if words is None:
words = [token.text for token in doc]
if tags is None:
tags = [None for _ in doc]
tags = [None for _ in words]
if heads is None:
heads = [None for token in doc]
heads = [None for token in words]
if deps is None:
deps = [None for _ in doc]
deps = [None for _ in words]
if entities is None:
entities = [None for _ in doc]
entities = [None for _ in words]
elif len(entities) == 0:
entities = ['O' for _ in doc]
entities = ['O' for _ in words]
elif not isinstance(entities[0], basestring):
# Assume we have entities specified by character offset.
entities = biluo_tags_from_offsets(doc, entities)
@ -462,7 +508,7 @@ cdef class GoldParse:
# sequence of gold words.
# If we "mis-segment", we'll have a sequence of predicted words covering
# a sequence of gold words. That's many-to-many -- we don't do that.
cost, i2j, j2i, i2j_multi, j2i_multi = align([t.orth_ for t in doc], words)
cost, i2j, j2i, i2j_multi, j2i_multi, undersegmented = align([t.orth_ for t in doc], words)
self.cand_to_gold = [(j if j >= 0 else None) for j in i2j]
self.gold_to_cand = [(i if i >= 0 else None) for i in j2i]
@ -478,7 +524,18 @@ cdef class GoldParse:
self.labels[i] = None
self.ner[i] = 'O'
if gold_i is None:
if i in i2j_multi:
if i in undersegmented:
self.words[i] = [words[j] for j in undersegmented[i]]
self.tags[i] = [tags[j] for j in undersegmented[i]]
self.labels[i] = [deps[j] for j in undersegmented[i]]
self.ner[i] = [entities[j] for j in undersegmented[i]]
self.heads[i] = []
for h in [heads[j] for j in undersegmented[i]]:
if heads[h] is None:
self.heads[i].append(None)
else:
self.heads[i].append(self.gold_to_cand[heads[h]])
elif i in i2j_multi:
self.words[i] = words[i2j_multi[i]]
self.tags[i] = tags[i2j_multi[i]]
is_last = i2j_multi[i] != i2j_multi.get(i+1)
@ -495,25 +552,26 @@ cdef class GoldParse:
# BILOU tags. We can't have BB or LL etc.
# Case 1: O -- easy.
ner_tag = entities[i2j_multi[i]]
if ner_tag == 'O':
self.ner[i] = 'O'
# Case 2: U. This has to become a B I* L sequence.
elif ner_tag.startswith('U-'):
if is_first:
self.ner[i] = ner_tag.replace('U-', 'B-', 1)
elif is_last:
self.ner[i] = ner_tag.replace('U-', 'L-', 1)
else:
self.ner[i] = ner_tag.replace('U-', 'I-', 1)
# Case 3: L. If not last, change to I.
elif ner_tag.startswith('L-'):
if is_last:
if ner_tag is not None:
if ner_tag == 'O':
self.ner[i] = 'O'
# Case 2: U. This has to become a B I* L sequence.
elif ner_tag.startswith('U-'):
if is_first:
self.ner[i] = ner_tag.replace('U-', 'B-', 1)
elif is_last:
self.ner[i] = ner_tag.replace('U-', 'L-', 1)
else:
self.ner[i] = ner_tag.replace('U-', 'I-', 1)
# Case 3: L. If not last, change to I.
elif ner_tag.startswith('L-'):
if is_last:
self.ner[i] = ner_tag
else:
self.ner[i] = ner_tag.replace('L-', 'I-', 1)
# Case 4: I. Stays correct
elif ner_tag.startswith('I-'):
self.ner[i] = ner_tag
else:
self.ner[i] = ner_tag.replace('L-', 'I-', 1)
# Case 4: I. Stays correct
elif ner_tag.startswith('I-'):
self.ner[i] = ner_tag
else:
self.words[i] = words[gold_i]
self.tags[i] = tags[gold_i]
@ -523,8 +581,7 @@ cdef class GoldParse:
self.heads[i] = self.gold_to_cand[heads[gold_i]]
self.labels[i] = deps[gold_i]
self.ner[i] = entities[gold_i]
cycle = nonproj.contains_cycle(self.heads)
cycle = nonproj.contains_cycle(_flatten_fused_heads(self.heads))
if cycle is not None:
raise Exception("Cycle found: %s" % cycle)