mirror of
https://github.com/explosion/spaCy.git
synced 2025-08-13 16:44:56 +03:00
Fix handling of heads for undersegmented tokens
This commit is contained in:
parent
aa5ecf7fd2
commit
06a5be9dfd
|
@ -109,7 +109,9 @@ class Alignment(object):
|
||||||
'''
|
'''
|
||||||
output = []
|
output = []
|
||||||
for i, alignment in enumerate(self._y2t):
|
for i, alignment in enumerate(self._y2t):
|
||||||
if isinstance(alignment, int):
|
if alignment is None:
|
||||||
|
output.append(None)
|
||||||
|
elif isinstance(alignment, int):
|
||||||
output.append(items[alignment])
|
output.append(items[alignment])
|
||||||
elif isinstance(alignment, tuple):
|
elif isinstance(alignment, tuple):
|
||||||
output.append((items[alignment[0]], alignment[1]))
|
output.append((items[alignment[0]], alignment[1]))
|
||||||
|
|
|
@ -34,8 +34,6 @@ cdef class GoldParse:
|
||||||
cdef public object cats
|
cdef public object cats
|
||||||
cdef public object _alignment
|
cdef public object _alignment
|
||||||
|
|
||||||
cdef readonly list cand_to_gold
|
|
||||||
cdef readonly list gold_to_cand
|
|
||||||
cdef readonly list orig_annot
|
cdef readonly list orig_annot
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -443,19 +443,45 @@ cdef class GoldParse:
|
||||||
# sequence of gold words.
|
# sequence of gold words.
|
||||||
# If we "mis-segment", we'll have a sequence of predicted words covering
|
# If we "mis-segment", we'll have a sequence of predicted words covering
|
||||||
# a sequence of gold words. That's many-to-many -- we don't do that.
|
# a sequence of gold words. That's many-to-many -- we don't do that.
|
||||||
self._alignment = Alignment([t.orth_ for t in doc], words)
|
if words is not None:
|
||||||
|
self._alignment = Alignment([t.text for t in doc], words)
|
||||||
|
else:
|
||||||
|
self._alignment = Alignment([t.text for t in doc], [t.text for t in doc])
|
||||||
|
|
||||||
annot_tuples = (range(len(words)), words, tags, heads, deps, entities)
|
annot_tuples = (range(len(words)), words, tags, heads, deps, entities)
|
||||||
self.orig_annot = list(zip(*annot_tuples))
|
self.orig_annot = list(zip(*annot_tuples))
|
||||||
|
|
||||||
|
if words is not None:
|
||||||
self.words = self._alignment.to_yours(words)
|
self.words = self._alignment.to_yours(words)
|
||||||
|
if tags is not None:
|
||||||
self.tags = self._alignment.to_yours(tags)
|
self.tags = self._alignment.to_yours(tags)
|
||||||
|
if deps is not None:
|
||||||
self.labels = self._alignment.to_yours(deps)
|
self.labels = self._alignment.to_yours(deps)
|
||||||
|
if tags is not None:
|
||||||
self.tags = self._alignment.to_yours(tags)
|
self.tags = self._alignment.to_yours(tags)
|
||||||
|
if entities is not None:
|
||||||
self.ner = self._alignment.to_yours(entities)
|
self.ner = self._alignment.to_yours(entities)
|
||||||
|
if heads is not None:
|
||||||
aligned_heads = [self._alignment.index_to_yours(h) for h in heads]
|
for gold_i, gold_head in enumerate(heads):
|
||||||
self.heads = self._alignment.to_yours(aligned_heads)
|
if gold_head is None:
|
||||||
|
continue
|
||||||
|
cand_i = self._alignment._t2y[gold_i]
|
||||||
|
cand_head = self._alignment._t2y[gold_head]
|
||||||
|
if cand_i is None or cand_head is None:
|
||||||
|
continue
|
||||||
|
elif isinstance(cand_i, int):
|
||||||
|
self.heads[cand_i] = cand_head
|
||||||
|
elif isinstance(cand_i, list):
|
||||||
|
for sub_i in cand_i[:-1]:
|
||||||
|
self.heads[sub_i] = sub_i+1
|
||||||
|
self.heads[cand_i[-1]] = cand_head
|
||||||
|
elif isinstance(cand_i, tuple):
|
||||||
|
cand_i, sub_i = cand_i
|
||||||
|
if not isinstance(self.heads[cand_i], list):
|
||||||
|
self.heads[cand_i] = []
|
||||||
|
while len(self.heads[cand_i]) <= sub_i:
|
||||||
|
self.heads[cand_i].append(None)
|
||||||
|
self.heads[cand_i][sub_i] = cand_head
|
||||||
|
|
||||||
for i in range(len(doc)):
|
for i in range(len(doc)):
|
||||||
# Fix spaces
|
# Fix spaces
|
||||||
|
@ -472,13 +498,18 @@ cdef class GoldParse:
|
||||||
or not isinstance(self.labels[i+1], tuple) \
|
or not isinstance(self.labels[i+1], tuple) \
|
||||||
or self.labels[i][1] < sub_i:
|
or self.labels[i][1] < sub_i:
|
||||||
self.labels[i] = self.labels[i][0]
|
self.labels[i] = self.labels[i][0]
|
||||||
self.heads[i] = self.heads[i][0]
|
|
||||||
else:
|
else:
|
||||||
self.labels[i] = 'subtok'
|
self.labels[i] = 'subtok'
|
||||||
self.heads[i] = i+1
|
self.heads[i] = i+1
|
||||||
|
|
||||||
cycle = nonproj.contains_cycle(self._alignment.flatten(self.heads))
|
cycle = nonproj.contains_cycle(self._alignment.flatten(self.heads))
|
||||||
if cycle is not None:
|
if cycle is not None:
|
||||||
|
print(repr(doc.text))
|
||||||
|
print([t.text for t in doc])
|
||||||
|
print(words)
|
||||||
|
print(self.labels)
|
||||||
|
print(list(enumerate(self.heads)))
|
||||||
|
print(heads)
|
||||||
raise Exception("Cycle found: %s" % cycle)
|
raise Exception("Cycle found: %s" % cycle)
|
||||||
|
|
||||||
def __len__(self):
|
def __len__(self):
|
||||||
|
|
Loading…
Reference in New Issue
Block a user