mirror of
https://github.com/explosion/spaCy.git
synced 2025-08-10 15:14:56 +03:00
Refactor alignment into its own class
This commit is contained in:
parent
9c3612d40b
commit
e6641a11b1
188
spacy/_align.pyx
188
spacy/_align.pyx
|
@ -85,12 +85,134 @@ i.e. D[i,j+1] + 1
|
|||
from __future__ import unicode_literals
|
||||
from libc.stdint cimport uint32_t
|
||||
import numpy
|
||||
import copy
|
||||
cimport numpy as np
|
||||
from .compat import unicode_
|
||||
from murmurhash.mrmr cimport hash32
|
||||
from collections import Counter
|
||||
|
||||
|
||||
def align(S, T):
|
||||
class Alignment(object):
|
||||
def __init__(self, your_words, their_words):
|
||||
cost, your2their, their2your = self._align(your_words, their_words)
|
||||
self.cost = cost
|
||||
self._y2t = your2their
|
||||
self._t2y = their2your
|
||||
|
||||
def to_yours(self, items):
|
||||
'''Translate a list of token annotations into your tokenization. Returns
|
||||
a list of length equal to your tokens. When one of your tokens aligns
|
||||
to multiple items, the entry will be a list. When multiple of your
|
||||
tokens align to one item, you'll get a tuple (value, index, n_to_go),
|
||||
where index is an int starting from 0, and n_to_go tells you how
|
||||
many remaining subtokens align to the same value.
|
||||
'''
|
||||
output = []
|
||||
for i, alignment in enumerate(self._t2y):
|
||||
if len(alignment) == 1 and alignment[0][1] == 0:
|
||||
output.append(items[alignment[0][0]])
|
||||
else:
|
||||
output.append([])
|
||||
for j1, j2 in alignment:
|
||||
output[-1].append((items[j1], j2))
|
||||
return output
|
||||
|
||||
def index_to_yours(self, index):
|
||||
'''Translate an index that points into their tokens to point into yours'''
|
||||
alignment = self._t2y[index]
|
||||
if len(alignment) == 1 and alignment[0][2] == 0:
|
||||
return alignment[0][0]
|
||||
else:
|
||||
output = []
|
||||
for i1, i2, n_to_go in alignment:
|
||||
output.append((i1, i2, n_to_go))
|
||||
return output
|
||||
|
||||
def to_theirs(self, items):
|
||||
raise NotImplementedError
|
||||
|
||||
def index_to_theirs(self, index):
|
||||
raise NotImplementedError
|
||||
|
||||
@classmethod
|
||||
def _align(cls, cand_words, gold_words):
|
||||
'''Find best alignment between candidate tokenization and gold tokenization.
|
||||
|
||||
Returns the alignment cost and alignment tables in both directions:
|
||||
cand_to_gold and gold_to_cand
|
||||
|
||||
Alignment entries are lists of addresses, where an address is a tuple
|
||||
(position, subtoken). This allows one-to-many and many-to-one alignment.
|
||||
|
||||
For instance, let's say we align:
|
||||
|
||||
Cand: ['ab', 'c', 'd']
|
||||
Gold: ['a', 'b', 'cd']
|
||||
|
||||
The cand_to_gold alignment would be:
|
||||
|
||||
[[0, 0], (2, 0), (2, 1)]
|
||||
|
||||
And the gold_to_cand alignment:
|
||||
|
||||
[(0, 0), (0, 1), [1, 2]]
|
||||
'''
|
||||
if cand_words == gold_words:
|
||||
alignment = numpy.arange(len(cand_words))
|
||||
return 0, alignment, alignment, {}, {}, []
|
||||
cand_words = [w.replace(' ', '') for w in cand_words]
|
||||
gold_words = [w.replace(' ', '') for w in gold_words]
|
||||
cost, i2j, j2i, matrix = levenshtein_align(cand_words, gold_words)
|
||||
|
||||
i_lengths = [len(w) for w in cand_words]
|
||||
j_lengths = [len(w) for w in gold_words]
|
||||
|
||||
cand2gold, gold2cand = multi_align(i2j, j2i, i_lengths, j_lengths)
|
||||
return cost, cand2gold, gold2cand
|
||||
|
||||
@staticmethod
|
||||
def flatten(heads):
|
||||
'''Let's say we have a heads array with fused tokens. We might have
|
||||
something like:
|
||||
|
||||
[[(0, 1), 1], 1]
|
||||
|
||||
This indicates that token 0 aligns to two gold tokens. The head of the
|
||||
first subtoken is the second subtoken. The head of the second subtoken
|
||||
is the second token.
|
||||
|
||||
So we expand to a tree:
|
||||
|
||||
[1, 2, 2]
|
||||
|
||||
This is helpful for preventing other functions from knowing our weird
|
||||
format.
|
||||
'''
|
||||
# Get an alignment -- normalize to the more complicated format; so
|
||||
# if we have an int i, treat it as [(i, 0)]
|
||||
j = 0
|
||||
alignment = {(None, 0): None}
|
||||
for i, tokens in enumerate(heads):
|
||||
if not isinstance(tokens, list):
|
||||
alignment[(i, 0)] = j
|
||||
j += 1
|
||||
else:
|
||||
for sub_i in range(len(tokens)):
|
||||
alignment[(i, sub_i)] = j
|
||||
j += 1
|
||||
# Apply the alignment to get the new values
|
||||
new = []
|
||||
for head_vals in heads:
|
||||
if not isinstance(head_vals, list):
|
||||
head_vals = [(head_vals, 0)]
|
||||
for head_val in head_vals:
|
||||
if not isinstance(head_val, tuple):
|
||||
head_val = (head_val, 0)
|
||||
new.append(alignment[head_val])
|
||||
return new
|
||||
|
||||
|
||||
def levenshtein_align(S, T):
|
||||
cdef int m = len(S)
|
||||
cdef int n = len(T)
|
||||
cdef np.ndarray matrix = numpy.zeros((m+1, n+1), dtype='int32')
|
||||
|
@ -128,9 +250,39 @@ def multi_align(np.ndarray i2j, np.ndarray j2i, i_lengths, j_lengths):
|
|||
'''
|
||||
i2j_miss = _get_regions(i2j, i_lengths)
|
||||
j2i_miss = _get_regions(j2i, j_lengths)
|
||||
|
||||
i2j_many2one = _get_many2one(i2j_miss, j2i_miss, i_lengths, j_lengths)
|
||||
j2i_many2one = _get_many2one(j2i_miss, i2j_miss, j_lengths, i_lengths)
|
||||
i2j_one2many = _get_one2many(j2i_many2one)
|
||||
j2i_one2many = _get_one2many(i2j_many2one)
|
||||
i2j_one2part = _get_one2part(j2i_many2one)
|
||||
j2i_one2part = _get_one2part(i2j_many2one)
|
||||
|
||||
i2j_multi, j2i_multi = _get_mapping(i2j_miss, j2i_miss, i_lengths, j_lengths)
|
||||
return i2j_multi, j2i_multi
|
||||
# Now get the more usable format we'll return
|
||||
cand2gold = _convert_multi_align(i2j, i2j_many2one, i2j_one2many, i2j_one2part)
|
||||
gold2cand = _convert_multi_align(j2i, j2i_many2one, j2i_one2many, j2i_one2part)
|
||||
return cand2gold, gold2cand
|
||||
|
||||
|
||||
def _convert_multi_align(one2one, many2one, one2many, one2part):
|
||||
output = []
|
||||
seen_j = Counter()
|
||||
for i, j in enumerate(one2one):
|
||||
if j != -1:
|
||||
output.append(j)
|
||||
elif i in many2one:
|
||||
j = many2one[i]
|
||||
output.append((j, seen_j[j]))
|
||||
seen_j[j] += 1
|
||||
elif i in one2many:
|
||||
output.append([])
|
||||
for j in one2many[i]:
|
||||
output[-1].append(j)
|
||||
elif i in one2part:
|
||||
output.append(one2part[i])
|
||||
else:
|
||||
output.append(None)
|
||||
return output
|
||||
|
||||
|
||||
def _get_regions(alignment, lengths):
|
||||
|
@ -149,9 +301,10 @@ def _get_regions(alignment, lengths):
|
|||
return regions
|
||||
|
||||
|
||||
def _get_mapping(miss1, miss2, lengths1, lengths2):
|
||||
i2j = {}
|
||||
j2i = {}
|
||||
def _get_many2one(miss1, miss2, lengths1, lengths2):
|
||||
miss1 = copy.deepcopy(miss1)
|
||||
miss2 = copy.deepcopy(miss2)
|
||||
i2j_many2one = {}
|
||||
for start, region1 in miss1.items():
|
||||
if not region1 or start not in miss2:
|
||||
continue
|
||||
|
@ -166,8 +319,7 @@ def _get_mapping(miss1, miss2, lengths1, lengths2):
|
|||
buff.append(region1.pop(0))
|
||||
if sum(lengths1[i] for i in buff) == lengths2[j]:
|
||||
for i in buff:
|
||||
i2j[i] = j
|
||||
j2i[j] = buff[-1]
|
||||
i2j_many2one[i] = j
|
||||
j += 1
|
||||
buff = []
|
||||
elif sum(lengths1[i] for i in buff) > lengths2[j]:
|
||||
|
@ -175,11 +327,25 @@ def _get_mapping(miss1, miss2, lengths1, lengths2):
|
|||
else:
|
||||
if buff and sum(lengths1[i] for i in buff) == lengths2[j]:
|
||||
for i in buff:
|
||||
i2j[i] = j
|
||||
j2i[j] = buff[-1]
|
||||
return i2j, j2i
|
||||
i2j_many2one[i] = j
|
||||
return i2j_many2one
|
||||
|
||||
|
||||
def _get_one2many(many2one):
|
||||
one2many = {}
|
||||
for j, i in many2one.items():
|
||||
one2many.setdefault(i, []).append(j)
|
||||
return one2many
|
||||
|
||||
|
||||
def _get_one2part(many2one):
|
||||
one2part = {}
|
||||
seen_j = Counter()
|
||||
for i, j in many2one.items():
|
||||
one2part[i] = (j, seen_j[j])
|
||||
seen_j[j] += 1
|
||||
return one2part
|
||||
|
||||
def _convert_sequence(seq):
|
||||
if isinstance(seq, numpy.ndarray):
|
||||
return numpy.ascontiguousarray(seq, dtype='uint32_t')
|
||||
|
|
150
spacy/gold.pyx
150
spacy/gold.pyx
|
@ -11,10 +11,11 @@ import tempfile
|
|||
import shutil
|
||||
from pathlib import Path
|
||||
import msgpack
|
||||
from collections import Counter
|
||||
|
||||
import ujson
|
||||
|
||||
from . import _align
|
||||
from ._align import Alignment
|
||||
from .syntax import nonproj
|
||||
from .tokens import Doc
|
||||
from . import util
|
||||
|
@ -23,6 +24,7 @@ from .compat import json_dumps
|
|||
|
||||
from libc.stdio cimport FILE, fopen, fclose, fread, fwrite, feof, fseek
|
||||
|
||||
|
||||
def tags_to_entities(tags):
|
||||
entities = []
|
||||
start = None
|
||||
|
@ -68,32 +70,6 @@ def merge_sents(sents):
|
|||
return [(m_deps, m_brackets)]
|
||||
|
||||
|
||||
punct_re = re.compile(r'\W')
|
||||
def align(cand_words, gold_words):
|
||||
if cand_words == gold_words:
|
||||
alignment = numpy.arange(len(cand_words))
|
||||
return 0, alignment, alignment, {}, {}, []
|
||||
cand_words = [w.replace(' ', '') for w in cand_words]
|
||||
gold_words = [w.replace(' ', '') for w in gold_words]
|
||||
cost, i2j, j2i, matrix = _align.align(cand_words, gold_words)
|
||||
i2j_multi, j2i_multi = _align.multi_align(i2j, j2i, [len(w) for w in cand_words],
|
||||
[len(w) for w in gold_words])
|
||||
for i, j in list(i2j_multi.items()):
|
||||
if i2j_multi.get(i+1) != j and i2j_multi.get(i-1) != j:
|
||||
i2j[i] = j
|
||||
i2j_multi.pop(i)
|
||||
for j, i in list(j2i_multi.items()):
|
||||
if j2i_multi.get(j+1) != i and j2i_multi.get(j-1) != i:
|
||||
j2i[j] = i
|
||||
j2i_multi.pop(j)
|
||||
reverse_j2i, reverse_i2j = _align.multi_align(j2i, i2j, [len(w) for w in gold_words],
|
||||
[len(w) for w in cand_words])
|
||||
undersegmented = {}
|
||||
for j, i in reverse_j2i.items():
|
||||
undersegmented.setdefault(i, []).append(j)
|
||||
return cost, i2j, j2i, i2j_multi, j2i_multi, undersegmented
|
||||
|
||||
|
||||
class GoldCorpus(object):
|
||||
"""An annotated corpus, using the JSON file format. Manages
|
||||
annotations for tagging, dependency parsing and NER."""
|
||||
|
@ -385,47 +361,6 @@ def _consume_ent(tags):
|
|||
return [start] + middle + [end]
|
||||
|
||||
|
||||
def _flatten_fused_heads(heads):
|
||||
'''Let's say we have a heads array with fused tokens. We might have
|
||||
something like:
|
||||
|
||||
[[(0, 1), 1], 1]
|
||||
|
||||
This indicates that token 0 aligns to two gold tokens. The head of the
|
||||
first subtoken is the second subtoken. The head of the second subtoken
|
||||
is the second token.
|
||||
|
||||
So we expand to a tree:
|
||||
|
||||
[1, 2, 2]
|
||||
|
||||
This is helpful for preventing other functions from knowing our weird
|
||||
format.
|
||||
'''
|
||||
# Get an alignment -- normalize to the more complicated format; so
|
||||
# if we have an int i, treat it as [(i, 0)]
|
||||
j = 0
|
||||
alignment = {(None, 0): None}
|
||||
for i, tokens in enumerate(heads):
|
||||
if not isinstance(tokens, list):
|
||||
alignment[(i, 0)] = j
|
||||
j += 1
|
||||
else:
|
||||
for sub_i in range(len(tokens)):
|
||||
alignment[(i, sub_i)] = j
|
||||
j += 1
|
||||
# Apply the alignment to get the new values
|
||||
new = []
|
||||
for head_vals in heads:
|
||||
if not isinstance(head_vals, list):
|
||||
head_vals = [(head_vals, 0)]
|
||||
for head_val in head_vals:
|
||||
if not isinstance(head_val, tuple):
|
||||
head_val = (head_val, 0)
|
||||
new.append(alignment[head_val])
|
||||
return new
|
||||
|
||||
|
||||
cdef class GoldParse:
|
||||
"""Collection for training annotations."""
|
||||
@classmethod
|
||||
|
@ -508,80 +443,30 @@ cdef class GoldParse:
|
|||
# sequence of gold words.
|
||||
# If we "mis-segment", we'll have a sequence of predicted words covering
|
||||
# a sequence of gold words. That's many-to-many -- we don't do that.
|
||||
cost, i2j, j2i, i2j_multi, j2i_multi, undersegmented = align([t.orth_ for t in doc], words)
|
||||
|
||||
self.cand_to_gold = [(j if j >= 0 else None) for j in i2j]
|
||||
self.gold_to_cand = [(i if i >= 0 else None) for i in j2i]
|
||||
self._alignment = Alignment([t.orth_ for t in doc], words)
|
||||
|
||||
annot_tuples = (range(len(words)), words, tags, heads, deps, entities)
|
||||
self.orig_annot = list(zip(*annot_tuples))
|
||||
|
||||
for i, gold_i in enumerate(self.cand_to_gold):
|
||||
self.words = self._alignment.to_yours(words)
|
||||
self.tags = self._alignment.to_yours(tags)
|
||||
self.labels = self._alignment.to_yours(deps)
|
||||
self.tags = self._alignment.to_yours(tags)
|
||||
self.ner = self._alignment.to_yours(entities)
|
||||
|
||||
aligned_heads = [self._alignment.index_to_yours(h) for h in heads]
|
||||
self.heads = self._alignment.to_yours(aligned_heads)
|
||||
|
||||
for i in range(len(doc)):
|
||||
# Fix spaces
|
||||
if doc[i].text.isspace():
|
||||
self.words[i] = doc[i].text
|
||||
self.tags[i] = '_SP'
|
||||
self.heads[i] = None
|
||||
self.labels[i] = None
|
||||
self.ner[i] = 'O'
|
||||
if gold_i is None:
|
||||
if i in undersegmented:
|
||||
self.words[i] = [words[j] for j in undersegmented[i]]
|
||||
self.tags[i] = [tags[j] for j in undersegmented[i]]
|
||||
self.labels[i] = [deps[j] for j in undersegmented[i]]
|
||||
self.ner[i] = [entities[j] for j in undersegmented[i]]
|
||||
self.heads[i] = []
|
||||
for h in [heads[j] for j in undersegmented[i]]:
|
||||
if heads[h] is None:
|
||||
self.heads[i].append(None)
|
||||
else:
|
||||
self.heads[i].append(self.gold_to_cand[heads[h]])
|
||||
elif i in i2j_multi:
|
||||
self.words[i] = words[i2j_multi[i]]
|
||||
self.tags[i] = tags[i2j_multi[i]]
|
||||
is_last = i2j_multi[i] != i2j_multi.get(i+1)
|
||||
is_first = i2j_multi[i] != i2j_multi.get(i-1)
|
||||
# Set next word in multi-token span as head, until last
|
||||
if not is_last:
|
||||
self.heads[i] = i+1
|
||||
self.labels[i] = 'subtok'
|
||||
else:
|
||||
self.heads[i] = self.gold_to_cand[heads[i2j_multi[i]]]
|
||||
self.labels[i] = deps[i2j_multi[i]]
|
||||
# Now set NER...This is annoying because if we've split
|
||||
# got an entity word split into two, we need to adjust the
|
||||
# BILOU tags. We can't have BB or LL etc.
|
||||
# Case 1: O -- easy.
|
||||
ner_tag = entities[i2j_multi[i]]
|
||||
if ner_tag is not None:
|
||||
if ner_tag == 'O':
|
||||
self.ner[i] = 'O'
|
||||
# Case 2: U. This has to become a B I* L sequence.
|
||||
elif ner_tag.startswith('U-'):
|
||||
if is_first:
|
||||
self.ner[i] = ner_tag.replace('U-', 'B-', 1)
|
||||
elif is_last:
|
||||
self.ner[i] = ner_tag.replace('U-', 'L-', 1)
|
||||
else:
|
||||
self.ner[i] = ner_tag.replace('U-', 'I-', 1)
|
||||
# Case 3: L. If not last, change to I.
|
||||
elif ner_tag.startswith('L-'):
|
||||
if is_last:
|
||||
self.ner[i] = ner_tag
|
||||
else:
|
||||
self.ner[i] = ner_tag.replace('L-', 'I-', 1)
|
||||
# Case 4: I. Stays correct
|
||||
elif ner_tag.startswith('I-'):
|
||||
self.ner[i] = ner_tag
|
||||
else:
|
||||
self.words[i] = words[gold_i]
|
||||
self.tags[i] = tags[gold_i]
|
||||
if heads[gold_i] is None:
|
||||
self.heads[i] = None
|
||||
else:
|
||||
self.heads[i] = self.gold_to_cand[heads[gold_i]]
|
||||
self.labels[i] = deps[gold_i]
|
||||
self.ner[i] = entities[gold_i]
|
||||
cycle = nonproj.contains_cycle(_flatten_fused_heads(self.heads))
|
||||
|
||||
cycle = nonproj.contains_cycle(self._alignment.flatten(self.heads))
|
||||
if cycle is not None:
|
||||
raise Exception("Cycle found: %s" % cycle)
|
||||
|
||||
|
@ -690,3 +575,4 @@ def offsets_from_biluo_tags(doc, tags):
|
|||
|
||||
def is_punct_label(label):
|
||||
return label == 'P' or label.lower() == 'punct'
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user