mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 09:14:32 +03:00
95 lines
2.9 KiB
Cython
95 lines
2.9 KiB
Cython
|
from libc.string cimport memset
|
||
|
|
||
|
|
||
|
cdef class NERAnnotation:
    """Entity annotation for one token sequence, stored per token.

    Three parallel C int arrays (``starts``, ``ends``, ``labels``), each
    ``length`` items long and allocated from ``self.mem``, record for every
    token the span and label of the entity covering it. Tokens that belong to
    no entity hold -1 in all three arrays.
    """
    def __init__(self, entities, length, entity_types):
        """Build the per-token arrays from a list of entity spans.

        entities: iterable of (start, end, label) integer triples; ``end`` is
            exclusive. Also kept as-is on ``self.entities``.
        length: number of tokens, i.e. the size of each array.
        entity_types: accepted for interface compatibility; not used here.

        Raises ValueError if a span reaches outside ``[0, length)``.
        """
        cdef int start, end, label
        cdef int i
        self.mem = Pool()
        self.starts = <int*>self.mem.alloc(length, sizeof(int))
        self.ends = <int*>self.mem.alloc(length, sizeof(int))
        self.labels = <int*>self.mem.alloc(length, sizeof(int))
        self.entities = entities
        # memset works byte-wise: setting every byte to 0xFF makes each int
        # read back as -1 (two's complement), the "no entity" marker.
        memset(self.starts, -1, sizeof(int) * length)
        memset(self.ends, -1, sizeof(int) * length)
        memset(self.labels, -1, sizeof(int) * length)

        for start, end, label in entities:
            # The arrays are raw C buffers, so an out-of-range span would
            # silently write outside the allocation.
            if start < 0 or end > length:
                raise ValueError(
                    "Entity span (%d, %d) falls outside the %d annotated tokens"
                    % (start, end, length))
            for i in range(start, end):
                self.starts[i] = start
                self.ends[i] = end
                self.labels[i] = label

    @classmethod
    def from_bilous(cls, tag_strs, entity_types):
        """Create an annotation from a sequence of BILOU tag strings.

        tag_strs: one tag per token. 'O' and '-' mark tokens without an
            entity; every other tag has the form '<move>-<label>'.
        entity_types: list of label strings. A label's position in the list
            is its integer id; labels not yet present are appended, so the
            list may be modified in place.

        Returns a new instance covering ``len(tag_strs)`` tokens.
        """
        entities = []
        start = None
        for i, tag_str in enumerate(tag_strs):
            if tag_str == 'O' or tag_str == '-':
                continue
            # Split on the first hyphen only so labels may contain hyphens.
            move, label_str = tag_str.split('-', 1)
            # list.index raises ValueError for a missing item (it never
            # returns -1), so unseen labels are registered here.
            try:
                label = entity_types.index(label_str)
            except ValueError:
                label = len(entity_types)
                entity_types.append(label_str)
            if move == 'U':
                assert start is None, "U tag inside an open entity at token %d" % i
                entities.append((i, i + 1, label))
            elif move == 'B':
                assert start is None, "B tag inside an open entity at token %d" % i
                start = i
            elif move == 'L':
                assert start is not None, "L tag without an open entity at token %d" % i
                entities.append((start, i + 1, label))
                start = None
            # Any other move (e.g. 'I') neither opens nor closes a span.
        return cls(entities, len(tag_strs), entity_types)
|
||
|
|
||
|
|
||
|
|
||
|
def read_iob(file_, entity_types, create_tokens):
    """Read column-formatted IOB data into (tokens, annotation) pairs.

    file_: object with a ``read()`` method returning the whole text.
        Sentences are separated by blank lines; every token line must hold
        exactly four whitespace-separated fields, of which the first (word)
        and the last (IOB tag) are used.
    entity_types: list of label strings, forwarded to
        ``NERAnnotation.from_bilous``.
    create_tokens: callable that receives the list of words of one sentence;
        its return value becomes the first item of the pair.

    Blocks starting with '-DOCSTART-' and empty blocks are skipped, so empty
    input yields an empty list.

    Raises ValueError if a token line does not have exactly four fields.
    """
    sents = []
    for sent_str in file_.read().strip().split('\n\n'):
        # Runs of more than one blank line leave stray newlines on a block
        # (or produce an empty one); normalise before inspecting it.
        sent_str = sent_str.strip()
        if not sent_str or sent_str.startswith('-DOCSTART-'):
            continue
        words = []
        iob = []
        for token_str in sent_str.split('\n'):
            word, pos, chunk, ner = token_str.split()
            words.append(word)
            iob.append(ner)
        bilou = iob_to_bilou(iob)
        tokens = create_tokens(words)
        sents.append((tokens, NERAnnotation.from_bilous(bilou, entity_types)))
    return sents
|
||
|
|
||
|
|
||
|
def iob_to_bilou(tags):
    """Return the BILOU encoding of an IOB tag sequence as a new list.

    The input is copied first, so the caller's sequence is left untouched.
    The copy is then eaten from the front: a run of 'O' tags is carried over
    unchanged, the entity that follows is re-encoded, and so on until
    nothing is left.
    """
    pending = list(tags)
    converted = []
    while pending:
        converted.extend(_consume_os(pending))
        converted.extend(_consume_ent(pending))
    return converted
|
||
|
|
||
|
def _consume_os(tags):
    """Yield the leading 'O' tags of ``tags``, removing each one from the list.

    This is a generator: a tag is popped from the front of ``tags`` only when
    it is actually requested. Iteration ends at the first non-'O' tag or when
    the list is empty.
    """
    while True:
        if not tags or tags[0] != 'O':
            return
        yield tags.pop(0)
|
||
|
|
||
|
def _consume_ent(tags):
    """Remove one entity from the front of ``tags`` and return its BILOU tags.

    The first tag opens the entity; a leading 'B-' is treated like 'I-'.
    Every immediately following tag equal to 'I-<label>' belongs to the same
    entity. The consumed tags are deleted from ``tags`` in place.

    Returns [] for an empty list, ['U-<label>'] for a single-token entity,
    and otherwise ['B-<label>', 'I-<label>', ..., 'L-<label>'].
    """
    if not tags:
        return []
    first = tags[0]
    # Rewrite only the leading marker; replacing every 'B' in the tag would
    # also corrupt labels that contain that letter (e.g. 'B-BRAND').
    target = 'I-' + first[2:] if first.startswith('B-') else first
    length = 1
    while length < len(tags) and tags[length] == target:
        length += 1
    # Drop the whole entity at once instead of popping tag by tag.
    del tags[:length]
    label = target[2:]
    if length == 1:
        return ['U-' + label]
    middle = ['I-%s' % label] * (length - 2)
    return ['B-' + label] + middle + ['L-' + label]
|