mirror of
https://github.com/explosion/spaCy.git
synced 2025-08-10 15:14:56 +03:00
Represent fused tokens in GoldParse
Entries in GoldParse.{words, heads, tags, deps, ner} can now be lists instead of single values, to handle getting the analysis for fused tokens. For instance, let's say we have a token like "hows", while the gold-standard has two tokens, ["how", "s"]. We need to store the gold data for each of the two subtokens. Example gold.words: [["how", "s"], "it", "going"] Things get more complicated for heads, as we need to address particular subtokens. Let's say the gold heads for ["how", "s", "it", "going"] is [1, 1, 3, 1], i.e. the root "s" is within the subtoken. The gold.heads list would be: [[(0, 1), (0, 1)], 2, (0, 1)] The tuples indicate token 0, subtoken 1. A helper method _flatten_fused_heads is available that unpacks the above to [1, 1, 3, 1].
This commit is contained in:
parent
a64680c137
commit
3d182fbc43
115
spacy/gold.pyx
115
spacy/gold.pyx
|
@ -72,7 +72,7 @@ punct_re = re.compile(r'\W')
|
|||
def align(cand_words, gold_words):
|
||||
if cand_words == gold_words:
|
||||
alignment = numpy.arange(len(cand_words))
|
||||
return 0, alignment, alignment, {}, {}
|
||||
return 0, alignment, alignment, {}, {}, []
|
||||
cand_words = [w.replace(' ', '') for w in cand_words]
|
||||
gold_words = [w.replace(' ', '') for w in gold_words]
|
||||
cost, i2j, j2i, matrix = _align.align(cand_words, gold_words)
|
||||
|
@ -86,7 +86,12 @@ def align(cand_words, gold_words):
|
|||
if j2i_multi.get(j+1) != i and j2i_multi.get(j-1) != i:
|
||||
j2i[j] = i
|
||||
j2i_multi.pop(j)
|
||||
return cost, i2j, j2i, i2j_multi, j2i_multi
|
||||
reverse_j2i, reverse_i2j = _align.multi_align(j2i, i2j, [len(w) for w in gold_words],
|
||||
[len(w) for w in cand_words])
|
||||
undersegmented = {}
|
||||
for j, i in reverse_j2i.items():
|
||||
undersegmented.setdefault(i, []).append(j)
|
||||
return cost, i2j, j2i, i2j_multi, j2i_multi, undersegmented
|
||||
|
||||
|
||||
class GoldCorpus(object):
|
||||
|
@ -380,6 +385,47 @@ def _consume_ent(tags):
|
|||
return [start] + middle + [end]
|
||||
|
||||
|
||||
def _flatten_fused_heads(heads):
|
||||
'''Let's say we have a heads array with fused tokens. We might have
|
||||
something like:
|
||||
|
||||
[[(0, 1), 1], 1]
|
||||
|
||||
This indicates that token 0 aligns to two gold tokens. The head of the
|
||||
first subtoken is the second subtoken. The head of the second subtoken
|
||||
is the second token.
|
||||
|
||||
So we expand to a tree:
|
||||
|
||||
[1, 2, 2]
|
||||
|
||||
This is helpful for preventing other functions from knowing our weird
|
||||
format.
|
||||
'''
|
||||
# Get an alignment -- normalize to the more complicated format; so
|
||||
# if we have an int i, treat it as [(i, 0)]
|
||||
j = 0
|
||||
alignment = {(None, 0): None}
|
||||
for i, tokens in enumerate(heads):
|
||||
if not isinstance(tokens, list):
|
||||
alignment[(i, 0)] = j
|
||||
j += 1
|
||||
else:
|
||||
for sub_i in range(len(tokens)):
|
||||
alignment[(i, sub_i)] = j
|
||||
j += 1
|
||||
# Apply the alignment to get the new values
|
||||
new = []
|
||||
for head_vals in heads:
|
||||
if not isinstance(head_vals, list):
|
||||
head_vals = [(head_vals, 0)]
|
||||
for head_val in head_vals:
|
||||
if not isinstance(head_val, tuple):
|
||||
head_val = (head_val, 0)
|
||||
new.append(alignment[head_val])
|
||||
return new
|
||||
|
||||
|
||||
cdef class GoldParse:
|
||||
"""Collection for training annotations."""
|
||||
@classmethod
|
||||
|
@ -418,15 +464,15 @@ cdef class GoldParse:
|
|||
if words is None:
|
||||
words = [token.text for token in doc]
|
||||
if tags is None:
|
||||
tags = [None for _ in doc]
|
||||
tags = [None for _ in words]
|
||||
if heads is None:
|
||||
heads = [None for token in doc]
|
||||
heads = [None for token in words]
|
||||
if deps is None:
|
||||
deps = [None for _ in doc]
|
||||
deps = [None for _ in words]
|
||||
if entities is None:
|
||||
entities = [None for _ in doc]
|
||||
entities = [None for _ in words]
|
||||
elif len(entities) == 0:
|
||||
entities = ['O' for _ in doc]
|
||||
entities = ['O' for _ in words]
|
||||
elif not isinstance(entities[0], basestring):
|
||||
# Assume we have entities specified by character offset.
|
||||
entities = biluo_tags_from_offsets(doc, entities)
|
||||
|
@ -462,7 +508,7 @@ cdef class GoldParse:
|
|||
# sequence of gold words.
|
||||
# If we "mis-segment", we'll have a sequence of predicted words covering
|
||||
# a sequence of gold words. That's many-to-many -- we don't do that.
|
||||
cost, i2j, j2i, i2j_multi, j2i_multi = align([t.orth_ for t in doc], words)
|
||||
cost, i2j, j2i, i2j_multi, j2i_multi, undersegmented = align([t.orth_ for t in doc], words)
|
||||
|
||||
self.cand_to_gold = [(j if j >= 0 else None) for j in i2j]
|
||||
self.gold_to_cand = [(i if i >= 0 else None) for i in j2i]
|
||||
|
@ -478,7 +524,18 @@ cdef class GoldParse:
|
|||
self.labels[i] = None
|
||||
self.ner[i] = 'O'
|
||||
if gold_i is None:
|
||||
if i in i2j_multi:
|
||||
if i in undersegmented:
|
||||
self.words[i] = [words[j] for j in undersegmented[i]]
|
||||
self.tags[i] = [tags[j] for j in undersegmented[i]]
|
||||
self.labels[i] = [deps[j] for j in undersegmented[i]]
|
||||
self.ner[i] = [entities[j] for j in undersegmented[i]]
|
||||
self.heads[i] = []
|
||||
for h in [heads[j] for j in undersegmented[i]]:
|
||||
if heads[h] is None:
|
||||
self.heads[i].append(None)
|
||||
else:
|
||||
self.heads[i].append(self.gold_to_cand[heads[h]])
|
||||
elif i in i2j_multi:
|
||||
self.words[i] = words[i2j_multi[i]]
|
||||
self.tags[i] = tags[i2j_multi[i]]
|
||||
is_last = i2j_multi[i] != i2j_multi.get(i+1)
|
||||
|
@ -495,25 +552,26 @@ cdef class GoldParse:
|
|||
# BILOU tags. We can't have BB or LL etc.
|
||||
# Case 1: O -- easy.
|
||||
ner_tag = entities[i2j_multi[i]]
|
||||
if ner_tag == 'O':
|
||||
self.ner[i] = 'O'
|
||||
# Case 2: U. This has to become a B I* L sequence.
|
||||
elif ner_tag.startswith('U-'):
|
||||
if is_first:
|
||||
self.ner[i] = ner_tag.replace('U-', 'B-', 1)
|
||||
elif is_last:
|
||||
self.ner[i] = ner_tag.replace('U-', 'L-', 1)
|
||||
else:
|
||||
self.ner[i] = ner_tag.replace('U-', 'I-', 1)
|
||||
# Case 3: L. If not last, change to I.
|
||||
elif ner_tag.startswith('L-'):
|
||||
if is_last:
|
||||
if ner_tag is not None:
|
||||
if ner_tag == 'O':
|
||||
self.ner[i] = 'O'
|
||||
# Case 2: U. This has to become a B I* L sequence.
|
||||
elif ner_tag.startswith('U-'):
|
||||
if is_first:
|
||||
self.ner[i] = ner_tag.replace('U-', 'B-', 1)
|
||||
elif is_last:
|
||||
self.ner[i] = ner_tag.replace('U-', 'L-', 1)
|
||||
else:
|
||||
self.ner[i] = ner_tag.replace('U-', 'I-', 1)
|
||||
# Case 3: L. If not last, change to I.
|
||||
elif ner_tag.startswith('L-'):
|
||||
if is_last:
|
||||
self.ner[i] = ner_tag
|
||||
else:
|
||||
self.ner[i] = ner_tag.replace('L-', 'I-', 1)
|
||||
# Case 4: I. Stays correct
|
||||
elif ner_tag.startswith('I-'):
|
||||
self.ner[i] = ner_tag
|
||||
else:
|
||||
self.ner[i] = ner_tag.replace('L-', 'I-', 1)
|
||||
# Case 4: I. Stays correct
|
||||
elif ner_tag.startswith('I-'):
|
||||
self.ner[i] = ner_tag
|
||||
else:
|
||||
self.words[i] = words[gold_i]
|
||||
self.tags[i] = tags[gold_i]
|
||||
|
@ -523,8 +581,7 @@ cdef class GoldParse:
|
|||
self.heads[i] = self.gold_to_cand[heads[gold_i]]
|
||||
self.labels[i] = deps[gold_i]
|
||||
self.ner[i] = entities[gold_i]
|
||||
|
||||
cycle = nonproj.contains_cycle(self.heads)
|
||||
cycle = nonproj.contains_cycle(_flatten_fused_heads(self.heads))
|
||||
if cycle is not None:
|
||||
raise Exception("Cycle found: %s" % cycle)
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user