From 3b0b902384b09654fe3219cd9bb8182773157b99 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 12 Nov 2014 23:21:09 +1100 Subject: [PATCH] * IOB-style parsing working. Accuracy down from BILOU, from 87-88 to 85-86 --- spacy/ner/_state.pyx | 2 +- spacy/ner/annot.pxd | 8 ++ spacy/ner/annot.pyx | 94 +++++++++++++++++++++++ spacy/ner/feats.pxd | 0 spacy/ner/feats.pyx | 99 ++++++++++++++++++++++++ spacy/ner/greedy_parser.pxd | 8 +- spacy/ner/greedy_parser.pyx | 91 +++++++++++++--------- spacy/ner/io_moves.pxd | 26 +++++++ spacy/ner/io_moves.pyx | 149 ++++++++++++++++++++++++++++++++++++ spacy/ner/structs.pxd | 23 ++++++ 10 files changed, 461 insertions(+), 39 deletions(-) create mode 100644 spacy/ner/annot.pxd create mode 100644 spacy/ner/annot.pyx create mode 100644 spacy/ner/feats.pxd create mode 100644 spacy/ner/feats.pyx create mode 100644 spacy/ner/io_moves.pxd create mode 100644 spacy/ner/io_moves.pyx create mode 100644 spacy/ner/structs.pxd diff --git a/spacy/ner/_state.pyx b/spacy/ner/_state.pyx index 660a0642f..7f1892371 100644 --- a/spacy/ner/_state.pyx +++ b/spacy/ner/_state.pyx @@ -7,7 +7,7 @@ cdef int begin_entity(State* s, label) except -1: cdef int end_entity(State* s) except -1: - s.curr.end = s.i + 1 + s.curr.end = s.i s.ents[s.j] = s.curr s.j += 1 s.curr.start = 0 diff --git a/spacy/ner/annot.pxd b/spacy/ner/annot.pxd new file mode 100644 index 000000000..b1b49d64f --- /dev/null +++ b/spacy/ner/annot.pxd @@ -0,0 +1,8 @@ +from cymem.cymem cimport Pool + +cdef class NERAnnotation: + cdef Pool mem + cdef int* starts + cdef int* ends + cdef int* labels + cdef readonly list entities diff --git a/spacy/ner/annot.pyx b/spacy/ner/annot.pyx new file mode 100644 index 000000000..d04345319 --- /dev/null +++ b/spacy/ner/annot.pyx @@ -0,0 +1,94 @@ +from libc.string cimport memset + + +cdef class NERAnnotation: + def __init__(self, entities, length, entity_types): + self.mem = Pool() + self.starts = self.mem.alloc(length, sizeof(int)) + self.ends = 
self.mem.alloc(length, sizeof(int)) + self.labels = self.mem.alloc(length, sizeof(int)) + self.entities = entities + memset(self.starts, -1, sizeof(int) * length) + memset(self.ends, -1, sizeof(int) * length) + memset(self.labels, -1, sizeof(int) * length) + + cdef int start, end, label + for start, end, label in entities: + for i in range(start, end): + self.starts[i] = start + self.ends[i] = end + self.labels[i] = label + + @classmethod + def from_bilous(cls, tag_strs, entity_types): + entities = [] + start = None + for i, tag_str in enumerate(tag_strs): + if tag_str == 'O' or tag_str == '-': + continue + move, label_str = tag_str.split('-') + label = entity_types.index(label_str) + if label == -1: + label = len(entity_types) + entity_types.append(label) + if move == 'U': + assert start is None + entities.append((i, i+1, label)) + elif move == 'B': + assert start is None + start = i + elif move == 'L': + assert start is not None + entities.append((start, i+1, label)) + start = None + return cls(entities, len(tag_strs), entity_types) + + + +def read_iob(file_, entity_types, create_tokens): + sent_strs = file_.read().strip().split('\n\n') + sents = [] + for sent_str in sent_strs: + if sent_str.startswith('-DOCSTART-'): + continue + words = [] + iob = [] + for token_str in sent_str.split('\n'): + word, pos, chunk, ner = token_str.split() + words.append(word) + iob.append(ner) + bilou = iob_to_bilou(iob) + tokens = create_tokens(words) + sents.append((tokens, NERAnnotation.from_bilous(bilou, entity_types))) + return sents + + +def iob_to_bilou(tags): + out = [] + curr_label = None + tags = list(tags) + while tags: + out.extend(_consume_os(tags)) + out.extend(_consume_ent(tags)) + return out + +def _consume_os(tags): + while tags and tags[0] == 'O': + yield tags.pop(0) + +def _consume_ent(tags): + if not tags: + return [] + target = tags.pop(0).replace('B', 'I') + length = 1 + while tags and tags[0] == target: + length += 1 + tags.pop(0) + label = target[2:] + if 
length == 1: + return ['U-' + label] + else: + start = 'B-' + label + end = 'L-' + label + middle = ['I-%s' % label for _ in range(1, length - 1)] + return [start] + middle + [end] diff --git a/spacy/ner/feats.pxd b/spacy/ner/feats.pxd new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/ner/feats.pyx b/spacy/ner/feats.pyx new file mode 100644 index 000000000..60910f235 --- /dev/null +++ b/spacy/ner/feats.pyx @@ -0,0 +1,99 @@ +from .context import * + + +LOCAL = ( + (W_sic,), + (P1_sic,), + (N1_sic,), + (P2_sic,), + (N2_sic,), + + (P1_sic, W_sic,), + (W_sic, N1_sic), + + (W_prefix,), + (W_suffix,), + + (P1_shape,), + (W_shape,), + (N1_shape,), + (P1_shape, W_shape,), + (W_shape, P1_shape,), + (P1_shape, W_shape, N1_shape), + (N2_shape,), + (P2_shape,), + + (P2_norm, P1_norm, W_norm), + (P1_norm, W_norm, N1_norm), + (W_norm, N1_norm, N2_norm) +) + +POS = ( + (P2_pos,), + (P1_pos,), + (W_pos,), + (N1_pos,), + (N2_pos,), + + (P1_pos, W_pos), + (W_pos, N1_pos), + (P2_pos, P1_pos, W_pos), + (P1_pos, W_pos, N1_pos), + (W_pos, N1_pos, N2_pos) +) + +CLUSTERS = ( + (P2_cluster,), + (P1_cluster,), + (W_cluster,), + (N1_cluster,), + (N2_cluster,), + + (P1_cluster, W_cluster), + (W_cluster, N1_cluster), +) + + +CLUSTER_POS = ( + (P1_cluster, W_pos), + (W_pos, P1_cluster), + (W_cluster, N1_pos), + (W_pos, N1_cluster) +) + + +STATE = ( + (E0_sic,), + (E0_cluster,), + (E0_pos,), + (E_last_sic,), + (E_last_cluster,), + (E_last_pos,), + + (E0_sic, W_sic), + (E0_cluster, W_cluster), + (E0_pos, W_pos), + (E_last_sic, W_sic), + (E_last_pos, W_pos), + + (E0_pos, E_last_pos, W_pos), + (E0_cluster, E_last_cluster, W_cluster), + + (E0_sic, E_last_sic), + (E0_pos, E_last_pos), + (E0_cluster, E_last_cluster), + (E0_pos, E_last_cluster), + (E0_cluster, E_last_pos), + + (E1_sic,), + (E1_cluster,), + (E1_pos,), + + (E0_sic, E1_sic), + (E0_sic, E1_pos,), + (E0_pos, E1_sic,), + (E0_pos, E1_pos), +) + + +TEMPLATES = LOCAL + CLUSTERS + POS + CLUSTER_POS + STATE diff --git 
a/spacy/ner/greedy_parser.pxd b/spacy/ner/greedy_parser.pxd index 0e8577161..9ee4d668d 100644 --- a/spacy/ner/greedy_parser.pxd +++ b/spacy/ner/greedy_parser.pxd @@ -6,7 +6,8 @@ from thinc.typedefs cimport * from ..tokens cimport Tokens from ..typedefs cimport * -from .bilou_moves cimport Move +from .structs cimport Move +from .annot cimport NERAnnotation cdef class NERParser: @@ -14,6 +15,7 @@ cdef class NERParser: cdef Extractor extractor cdef LinearModel model cdef readonly list tag_names + cdef readonly list entity_types cdef readonly int n_classes cdef Move* _moves @@ -23,5 +25,5 @@ cdef class NERParser: cdef weight_t* _scores - cpdef int train(self, Tokens tokens, golds) except -1 - cpdef int set_tags(self, Tokens tokens) except -1 + cpdef list train(self, Tokens tokens, NERAnnotation annot) + cpdef list set_tags(self, Tokens tokens) diff --git a/spacy/ner/greedy_parser.pyx b/spacy/ner/greedy_parser.pyx index c84975c79..5825c7539 100644 --- a/spacy/ner/greedy_parser.pyx +++ b/spacy/ner/greedy_parser.pyx @@ -12,40 +12,51 @@ from thinc.features cimport ConjFeat from .context cimport fill_context from .context cimport N_FIELDS -from .bilou_moves cimport Move -from .bilou_moves cimport fill_moves, transition, best_accepted -from .bilou_moves cimport set_accept_if_valid, set_accept_if_oracle -from ._state cimport entity_is_open -from .bilou_moves import get_n_moves -from ._state cimport State +from .structs cimport Move, State +from .io_moves cimport fill_moves, transition, best_accepted +from .io_moves cimport set_accept_if_valid, set_accept_if_oracle +from .io_moves import get_n_moves from ._state cimport init_state +from ._state cimport entity_is_open +from ._state cimport end_entity +from .annot cimport NERAnnotation -def setup_model_dir(tag_names, templates, model_dir): +def setup_model_dir(entity_types, templates, model_dir): if path.exists(model_dir): shutil.rmtree(model_dir) os.mkdir(model_dir) config = { 'templates': templates, - 'tag_names': tag_names, + 
'entity_types': entity_types, } with open(path.join(model_dir, 'config.json'), 'w') as file_: json.dump(config, file_) - def train(train_sents, model_dir, nr_iter=10): cdef Tokens tokens + cdef NERAnnotation gold_ner parser = NERParser(model_dir) for _ in range(nr_iter): - n_corr = 0 - total = 0 - for i, (tokens, golds) in enumerate(train_sents): - if any([g == 0 for g in golds]): - continue - n_corr += parser.train(tokens, golds) - total += len([g for g in golds if g != 0]) - print('%.4f' % ((n_corr / total) * 100)) + tp = 0 + fp = 0 + fn = 0 + for i, (tokens, gold_ner) in enumerate(train_sents): + #print [tokens[i].string for i in range(tokens.length)] + test_ents = set(parser.train(tokens, gold_ner)) + #print 'Test', test_ents + gold_ents = set(gold_ner.entities) + #print 'Gold', set(gold_ner.entities) + tp += len(gold_ents.intersection(test_ents)) + fp += len(test_ents - gold_ents) + fn += len(gold_ents - test_ents) + p = tp / (tp + fp) + r = tp / (tp + fn) + f = 2 * ((p * r) / (p + r)) + print 'P: %.3f' % p, + print 'R: %.3f' % r, + print 'F: %.3f' % f random.shuffle(train_sents) parser.model.end_training() parser.model.dump(path.join(model_dir, 'model')) @@ -56,11 +67,11 @@ cdef class NERParser: self.mem = Pool() cfg = json.load(open(path.join(model_dir, 'config.json'))) templates = cfg['templates'] - self.tag_names = cfg['tag_names'] self.extractor = Extractor(templates, [ConjFeat] * len(templates)) - self.n_classes = len(self.tag_names) - self._moves = self.mem.alloc(len(self.tag_names), sizeof(Move)) - fill_moves(self._moves, self.tag_names) + self.entity_types = cfg['entity_types'] + self.n_classes = get_n_moves(len(self.entity_types)) + self._moves = self.mem.alloc(self.n_classes, sizeof(Move)) + fill_moves(self._moves, self.n_classes, self.entity_types) self.model = LinearModel(self.n_classes) if path.exists(path.join(model_dir, 'model')): self.model.load(path.join(model_dir, 'model')) @@ -70,14 +81,11 @@ cdef class NERParser: self._values = 
self.mem.alloc(self.extractor.n+1, sizeof(weight_t)) self._scores = self.mem.alloc(self.model.nr_class, sizeof(weight_t)) - cpdef int train(self, Tokens tokens, gold_classes) except -1: + cpdef list train(self, Tokens tokens, NERAnnotation annot): cdef Pool mem = Pool() cdef State* s = init_state(mem, tokens.length) - cdef Move* golds = mem.alloc(len(gold_classes), sizeof(Move)) - for tok_i, clas in enumerate(gold_classes): - golds[tok_i] = self._moves[clas] - assert golds[tok_i].clas == clas, '%d vs %d' % (golds[tok_i].clas, clas) cdef Move* guess + cdef Move* oracle_move n_correct = 0 cdef int f = 0 while s.i < tokens.length: @@ -88,23 +96,29 @@ cdef class NERParser: set_accept_if_valid(self._moves, self.n_classes, s) guess = best_accepted(self._moves, self._scores, self.n_classes) assert guess.clas != 0 - assert gold_classes[s.i] != 0 - set_accept_if_oracle(self._moves, golds, self.n_classes, s) - gold = best_accepted(self._moves, self._scores, self.n_classes) - if guess.clas == gold.clas: + set_accept_if_oracle(self._moves, self.n_classes, s, + annot.starts, annot.ends, annot.labels) + oracle_move = best_accepted(self._moves, self._scores, self.n_classes) + assert oracle_move.clas != 0 + if guess.clas == oracle_move.clas: counts = {} n_correct += 1 else: - counts = {guess.clas: {}, gold.clas: {}} - self.extractor.count(counts[gold.clas], self._feats, 1) + counts = {guess.clas: {}, oracle_move.clas: {}} + self.extractor.count(counts[oracle_move.clas], self._feats, 1) self.extractor.count(counts[guess.clas], self._feats, -1) self.model.update(counts) - gold_str = self.tag_names[gold.clas] transition(s, guess) tokens.ner[s.i-1] = s.tags[s.i-1] - return n_correct + if entity_is_open(s): + s.curr.label = annot.labels[s.curr.start] + end_entity(s) + entities = [] + for i in range(s.j): + entities.append((s.ents[i].start, s.ents[i].end, s.ents[i].label)) + return entities - cpdef int set_tags(self, Tokens tokens) except -1: + cpdef list set_tags(self, Tokens tokens): 
cdef Pool mem = Pool() cdef State* s = init_state(mem, tokens.length) cdef Move* move @@ -116,3 +130,10 @@ cdef class NERParser: move = best_accepted(self._moves, self._scores, self.n_classes) transition(s, move) tokens.ner[s.i-1] = s.tags[s.i-1] + if entity_is_open(s): + s.curr.label = move.label + end_entity(s) + entities = [] + for i in range(s.j): + entities.append((s.ents[i].start, s.ents[i].end, s.ents[i].label)) + return entities diff --git a/spacy/ner/io_moves.pxd b/spacy/ner/io_moves.pxd new file mode 100644 index 000000000..97f9512e8 --- /dev/null +++ b/spacy/ner/io_moves.pxd @@ -0,0 +1,26 @@ +from cymem.cymem cimport Pool + +from thinc.typedefs cimport class_t +from thinc.typedefs cimport weight_t + +from .structs cimport State, Move + + +cpdef enum ActionType: + MISSING + SHIFT + REDUCE + OUT + N_ACTIONS + + +cdef int set_accept_if_oracle(Move* moves, int n, State* s, + int* g_starts, int* g_ends, int* g_labels) except 0 + +cdef int set_accept_if_valid(Move* moves, int n, State* s) except 0 + +cdef Move* best_accepted(Move* moves, weight_t* scores, int n) except NULL + +cdef int transition(State *s, Move* m) except -1 + +cdef int fill_moves(Move* moves, int n, list entity_types) except -1 diff --git a/spacy/ner/io_moves.pyx b/spacy/ner/io_moves.pyx new file mode 100644 index 000000000..6837ccef5 --- /dev/null +++ b/spacy/ner/io_moves.pyx @@ -0,0 +1,149 @@ +from __future__ import unicode_literals +from cymem.cymem cimport Pool + +from thinc.typedefs cimport class_t +from thinc.typedefs cimport weight_t + +from ._state cimport begin_entity +from ._state cimport end_entity +from ._state cimport entity_is_open + + +ACTION_NAMES = ['' for _ in range(N_ACTIONS)] +ACTION_NAMES[MISSING] = '?' 
+ACTION_NAMES[SHIFT] = 'S' +ACTION_NAMES[REDUCE] = 'R' +ACTION_NAMES[OUT] = 'O' + + +cdef int set_accept_if_oracle(Move* moves, int n, State* s, + int* g_starts, int* g_ends, int* g_labels) except 0: + # If curr entity: (O invalid) + # if cost is not sunk (start matches, end is i-1 or greater + # - If i-1 == gold.end --> R=True, S=False + # - Shift if end >= i --> S=True, R=False + # else + # - If i == gold.start --> R=True, S=False + # - Else --> R=True, S=True + # Else (R invalid): + # if start == gold.start: S=True, O=False + # else: O=True, S=False + if entity_is_open(s): + g_start = g_starts[s.curr.start] + g_end = g_ends[s.curr.start] + accept_o = False + if g_start == s.curr.start and g_end >= s.i: + if g_end == s.i: + accept_r = True + r_label = g_labels[s.curr.start] + accept_s = False + else: + accept_s = True + accept_r = False + else: + if g_starts[s.i] == s.i: + accept_r = True + r_label = 0 + accept_s = False + else: + accept_r = True + accept_s = True + r_label = 0 + else: + accept_r = False + if g_starts[s.i] == s.i: + accept_s = True + accept_o = False + else: + accept_o = True + accept_s = False + n_accept = 0 + moves[0].accept = False + for i in range(1, n): + m = &moves[i] + if m.action == SHIFT: + m.accept = accept_s + elif m.action == REDUCE: + m.accept = accept_r and (r_label == 0 or m.label == r_label) + elif m.action == OUT: + m.accept = accept_o + n_accept += m.accept + assert n_accept != 0 + return n_accept + + +cdef int set_accept_if_valid(Move* moves, int n, State* s) except 0: + cdef int i + cdef bint open_ent = entity_is_open(s) + cdef int n_accept = 0 + moves[0].accept = False + for i in range(1, n): + if moves[i].action == SHIFT: + moves[i].accept = True + elif moves[i].action == REDUCE: + moves[i].accept = open_ent + elif moves[i].action == OUT: + moves[i].accept = not open_ent + n_accept += moves[i].accept + return n_accept + + +cdef Move* best_accepted(Move* moves, weight_t* scores, int n) except NULL: + cdef int first_accept = 
-1 + for first_accept in range(1, n): + if moves[first_accept].accept: + break + else: + raise StandardError + assert first_accept != -1 + cdef int best = first_accept + cdef weight_t score = scores[first_accept-1] + cdef int i + for i in range(first_accept+1, n): + if moves[i].accept and scores[i-1] > score: + best = i + score = scores[i-1] + return &moves[best] + + +cdef int transition(State *s, Move* move) except -1: + s.tags[s.i] = move.clas + if move.action == OUT: + s.i += 1 + elif move.action == SHIFT: + if not entity_is_open(s): + begin_entity(s, 0) + s.i += 1 + elif move.action == REDUCE: + s.curr.label = move.label + end_entity(s) + else: + raise ValueError(move.action) + + +def get_n_moves(n_tags): + return 1 + 1 + 1 + n_tags + + +cdef int fill_moves(Move* moves, int n, list entity_types) except -1: + cdef Move* m + label_names = {'-': 0} + # Reserve class 0 + cdef int i = 0 + moves[i].clas = i + moves[i].action = MISSING + moves[i].label = 0 + i += 1 + moves[i].clas = i + moves[i].action = SHIFT + moves[i].label = 0 + i += 1 + moves[i].clas = i + moves[i].action = OUT + moves[i].label = 0 + i += 1 + for entity_type in entity_types: + moves[i].action = REDUCE + moves[i].label = label_names.setdefault(entity_type, len(label_names)) + moves[i].clas = i + i += 1 diff --git a/spacy/ner/structs.pxd b/spacy/ner/structs.pxd new file mode 100644 index 000000000..7d6ebed19 --- /dev/null +++ b/spacy/ner/structs.pxd @@ -0,0 +1,23 @@ +from thinc.typedefs cimport class_t + + +cdef struct Entity: + int start + int end + int label + + +cdef struct State: + Entity curr + Entity* ents + int* tags + int i + int j + int length + + +cdef struct Move: + class_t clas + int action + int label + bint accept