From 043b758cf4e3ad8450fe11fb9f63e7340cb5aa4c Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 2 Feb 2015 16:37:25 +1100 Subject: [PATCH] * Resurrect old NER code. This version won't be the one that runs; we want to re-use the parser code. But for now this is a useful reference. --- spacy/ner/__init__.pxd | 0 spacy/ner/__init__.py | 0 spacy/ner/_feats.pxd | 0 spacy/ner/_feats.pyx | 169 +++++++++++++++++++++++++++++ spacy/ner/_state.pxd | 12 +++ spacy/ner/_state.pyx | 44 ++++++++ spacy/ner/annot.pxd | 8 ++ spacy/ner/annot.pyx | 94 ++++++++++++++++ spacy/ner/bilou_moves.pxd | 27 +++++ spacy/ner/bilou_moves.pyx | 207 ++++++++++++++++++++++++++++++++++++ spacy/ner/context.pxd | 153 ++++++++++++++++++++++++++ spacy/ner/context.pyx | 76 +++++++++++++ spacy/ner/feats.pxd | 0 spacy/ner/feats.pyx | 99 +++++++++++++++++ spacy/ner/greedy_parser.pxd | 29 +++++ spacy/ner/greedy_parser.pyx | 139 ++++++++++++++++++++++++ spacy/ner/io_moves.pxd | 26 +++++ spacy/ner/io_moves.pyx | 152 ++++++++++++++++++++++++++ spacy/ner/pystate.pxd | 16 +++ spacy/ner/pystate.pyx | 60 +++++++++++ spacy/ner/structs.pxd | 23 ++++ 21 files changed, 1334 insertions(+) create mode 100644 spacy/ner/__init__.pxd create mode 100644 spacy/ner/__init__.py create mode 100644 spacy/ner/_feats.pxd create mode 100644 spacy/ner/_feats.pyx create mode 100644 spacy/ner/_state.pxd create mode 100644 spacy/ner/_state.pyx create mode 100644 spacy/ner/annot.pxd create mode 100644 spacy/ner/annot.pyx create mode 100644 spacy/ner/bilou_moves.pxd create mode 100644 spacy/ner/bilou_moves.pyx create mode 100644 spacy/ner/context.pxd create mode 100644 spacy/ner/context.pyx create mode 100644 spacy/ner/feats.pxd create mode 100644 spacy/ner/feats.pyx create mode 100644 spacy/ner/greedy_parser.pxd create mode 100644 spacy/ner/greedy_parser.pyx create mode 100644 spacy/ner/io_moves.pxd create mode 100644 spacy/ner/io_moves.pyx create mode 100644 spacy/ner/pystate.pxd create mode 100644 spacy/ner/pystate.pyx create mode 100644 spacy/ner/structs.pxd diff --git a/spacy/ner/__init__.pxd b/spacy/ner/__init__.pxd new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/ner/__init__.py b/spacy/ner/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/ner/_feats.pxd b/spacy/ner/_feats.pxd new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/ner/_feats.pyx b/spacy/ner/_feats.pyx new file mode 100644 index 000000000..18e073c5b --- /dev/null +++ b/spacy/ner/_feats.pyx @@ -0,0 +1,169 @@ +from spacy.context cimport FIELD_IDS, Token + + +cdef Token P4 = FIELD_IDS.P4 +cdef Token P3 = FIELD_IDS.P3 +cdef Token P2 = FIELD_IDS.P2 +cdef Token P1 = FIELD_IDS.P1 +cdef Token N0 = FIELD_IDS.N0 +cdef Token N1 = FIELD_IDS.N1 +cdef Token N2 = FIELD_IDS.N2 +cdef Token N3 = FIELD_IDS.N3 +cdef Token N4 = FIELD_IDS.N4 + +""" +TEMPLATES = ( + (N0.sic,), + (N0.cluster,), + + (P1.pos,), + (P1.sic,), + + (N1.norm,), + (N1.pos,), + + (P1.ner,), + (P2.ner,), + + (N0.cluster,), + (P1.cluster,), + (N1.cluster,), + + (N0.is_alpha,), + (N0.is_digit,), + (N0.is_title,), + (N0.is_upper,), + + (N0.is_title, N0.oft_title), + (N0.is_upper, N0.oft_upper), + + (P1.cluster, N0.norm), + (N0.norm, N1.cluster), + + (P1.ner, N0.pos), + (P2.ner, P1.ner, N0.pos), + + (P2.pos, P1.pos, N0.sic), + (N0.sic, N1.pos, N2.pos) +) +""" + +LOCAL = ( + (N0.sic,), + (P1.sic,), + (N1.sic,), + (P2.sic,), + (N2.sic,), + (P3.sic,), + (N3.sic,), + (P4.sic,), + (N4.sic,), + + (P1.sic, N0.sic,), + (N0.sic, N1.sic), + + (N0.prefix,), + (N0.suffix,), + + (P1.shape,), + (N0.shape,), + (N1.shape,), + (P1.shape, N0.shape,), + (N0.shape, P1.shape,), + (P1.shape, N0.shape, N1.shape), + (N2.shape,), + (P2.shape,), + (P3.shape,), + (N3.shape,), + (P4.shape,), + (N4.shape,), + + (P2.norm, P1.norm, N0.norm), + (P1.norm, N0.norm, N1.norm), + (N0.norm, N1.norm, N2.norm) +) + +BOOLS = ( + (N0.is_title,), +) + + +HISTORY = ( + (P1.ner,), + (P1.ner, N0.sic,), + (P2.ner,), + (P2.ner, P1.ner), + (P2.ner, P1.ner, N0.sic), + (P2.pos, P1.ner, N0.pos), + (P2.ner, P1.pos, N0.pos), + (P3.ner,), + (P4.ner,), +) + +POS = ( + (P4.pos,), + (P3.pos,), + (P2.pos,), + (P1.pos,), + (N0.pos,), + (N1.pos,), + (N2.pos,), + (N3.pos,), + (N4.pos,), + + (P1.pos, N0.pos), + (N0.pos, N1.pos), + (P2.pos, P1.pos, N0.pos), + (P1.pos, N0.pos, N1.pos), + (N0.pos, N1.pos, N2.pos) +) + +CLUSTERS = ( + (P4.cluster,), + (P3.cluster,), + (P2.cluster,), + (P1.cluster,), + (N0.cluster,), + (N1.cluster,), + (N2.cluster,), + (N3.cluster,), + (N4.cluster,), + + (P1.cluster, N0.cluster), + (N0.cluster, N1.cluster), +) + + +CLUSTER_POS = ( + (P1.cluster, N0.pos), + (N0.pos, P1.cluster), + (N0.cluster, N1.pos), + (N0.pos, N1.cluster) +) + + +GAZ = ( + (N0.in_males,), + (N0.in_females,), + (N0.in_surnames,), + (N0.in_places,), + (N0.in_games,), + (N0.in_celebs,), + (N0.in_names,), + (P1.in_males,), + (P1.in_females,), + (P1.in_surnames,), + (P1.in_places,), + (P1.in_games,), + (P1.in_celebs,), + (P1.in_names,), + (N1.in_males,), + (N1.in_females,), + (N1.in_surnames,), + (N1.in_places,), + (N1.in_games,), + (N1.in_celebs,), + (N1.in_names,), +) + +TEMPLATES = LOCAL + HISTORY + CLUSTERS + POS + CLUSTER_POS + GAZ + BOOLS diff --git a/spacy/ner/_state.pxd b/spacy/ner/_state.pxd new file mode 100644 index 000000000..43b37d3bd --- /dev/null +++ b/spacy/ner/_state.pxd @@ -0,0 +1,12 @@ +from cymem.cymem cimport Pool +from .structs cimport State, Entity, Move + +cdef int begin_entity(State* s, label) except -1 + +cdef int end_entity(State* s) except -1 + +cdef State* init_state(Pool mem, int sent_length) except NULL + +cdef bint entity_is_open(State *s) except -1 + +cdef bint entity_is_sunk(State *s, Move* golds) except -1 diff --git a/spacy/ner/_state.pyx b/spacy/ner/_state.pyx new file mode 100644 index 000000000..7f1892371 --- /dev/null +++ b/spacy/ner/_state.pyx @@ -0,0 +1,44 @@ +from .bilou_moves cimport BEGIN, UNIT + + +cdef int begin_entity(State* s, label) except -1: + s.curr.start = s.i + s.curr.label = label + + +cdef int end_entity(State* s) except -1: + s.curr.end = s.i + s.ents[s.j] = s.curr + s.j += 1 + s.curr.start = 0 + s.curr.label = -1 + s.curr.end = 0 + + +cdef State* init_state(Pool mem, int sent_length) except NULL: + s = mem.alloc(1, sizeof(State)) + s.j = 0 + s.ents = mem.alloc(sent_length, sizeof(Entity)) + for i in range(sent_length): + s.ents[i].label = -1 + s.curr.label = -1 + s.tags = mem.alloc(sent_length, sizeof(int)) + s.length = sent_length + return s + + +cdef bint entity_is_open(State *s) except -1: + return s.curr.label != -1 + + +cdef bint entity_is_sunk(State *s, Move* golds) except -1: + if not entity_is_open(s): + return False + + cdef Move* gold = &golds[s.curr.start] + if gold.action != BEGIN and gold.action != UNIT: + return True + elif gold.label != s.curr.label: + return True + else: + return False diff --git a/spacy/ner/annot.pxd b/spacy/ner/annot.pxd new file mode 100644 index 000000000..b1b49d64f --- /dev/null +++ b/spacy/ner/annot.pxd @@ -0,0 +1,8 @@ +from cymem.cymem cimport Pool + +cdef class NERAnnotation: + cdef Pool mem + cdef int* starts + cdef int* ends + cdef int* labels + cdef readonly list entities diff --git a/spacy/ner/annot.pyx b/spacy/ner/annot.pyx new file mode 100644 index 000000000..d04345319 --- /dev/null +++ b/spacy/ner/annot.pyx @@ -0,0 +1,94 @@ +from libc.string cimport memset + + +cdef class NERAnnotation: + def __init__(self, entities, length, entity_types): + self.mem = Pool() + self.starts = self.mem.alloc(length, sizeof(int)) + self.ends = self.mem.alloc(length, sizeof(int)) + self.labels = self.mem.alloc(length, sizeof(int)) + self.entities = entities + memset(self.starts, -1, sizeof(int) * length) + memset(self.ends, -1, sizeof(int) * length) + memset(self.labels, -1, sizeof(int) * length) + + cdef int start, end, label + for start, end, label in entities: + for i in range(start, end): + self.starts[i] = start + self.ends[i] = end + self.labels[i] = label + + @classmethod + def from_bilous(cls, tag_strs, entity_types): + entities = [] + start = None + for i, tag_str in enumerate(tag_strs): + if tag_str == 'O' or tag_str == '-': + continue + move, label_str = tag_str.split('-') + label = entity_types.index(label_str) + if label == -1: + label = len(entity_types) + entity_types.append(label) + if move == 'U': + assert start is None + entities.append((i, i+1, label)) + elif move == 'B': + assert start is None + start = i + elif move == 'L': + assert start is not None + entities.append((start, i+1, label)) + start = None + return cls(entities, len(tag_strs), entity_types) + + + +def read_iob(file_, entity_types, create_tokens): + sent_strs = file_.read().strip().split('\n\n') + sents = [] + for sent_str in sent_strs: + if sent_str.startswith('-DOCSTART-'): + continue + words = [] + iob = [] + for token_str in sent_str.split('\n'): + word, pos, chunk, ner = token_str.split() + words.append(word) + iob.append(ner) + bilou = iob_to_bilou(iob) + tokens = create_tokens(words) + sents.append((tokens, NERAnnotation.from_bilous(bilou, entity_types))) + return sents + + +def iob_to_bilou(tags): + out = [] + curr_label = None + tags = list(tags) + while tags: + out.extend(_consume_os(tags)) + out.extend(_consume_ent(tags)) + return out + +def _consume_os(tags): + while tags and tags[0] == 'O': + yield tags.pop(0) + +def _consume_ent(tags): + if not tags: + return [] + target = tags.pop(0).replace('B', 'I') + length = 1 + while tags and tags[0] == target: + length += 1 + tags.pop(0) + label = target[2:] + if length == 1: + return ['U-' + label] + else: + start = 'B-' + label + end = 'L-' + label + middle = ['I-%s' % label for _ in range(1, length - 1)] + return [start] + middle + [end] diff --git a/spacy/ner/bilou_moves.pxd b/spacy/ner/bilou_moves.pxd new file mode 100644 index 000000000..20ec58291 --- /dev/null +++ b/spacy/ner/bilou_moves.pxd @@ -0,0 +1,27 @@ +from cymem.cymem cimport Pool + +from thinc.typedefs cimport class_t +from thinc.typedefs cimport weight_t + +from .structs cimport State, Move + + +cpdef enum ActionType: + MISSING + BEGIN + IN + LAST + UNIT + OUT + N_ACTIONS + + +cdef int set_accept_if_oracle(Move* moves, Move* golds, int n, State* s) except 0 + +cdef int set_accept_if_valid(Move* moves, int n, State* s) except 0 + +cdef Move* best_accepted(Move* moves, weight_t* scores, int n) except NULL + +cdef int transition(State *s, Move* m) except -1 + +cdef int fill_moves(Move* moves, list tag_names) except -1 diff --git a/spacy/ner/bilou_moves.pyx b/spacy/ner/bilou_moves.pyx new file mode 100644 index 000000000..42cef3fb7 --- /dev/null +++ b/spacy/ner/bilou_moves.pyx @@ -0,0 +1,207 @@ +from __future__ import unicode_literals + +from ._state cimport begin_entity +from ._state cimport end_entity +from ._state cimport entity_is_open +from ._state cimport entity_is_sunk + + +ACTION_NAMES = ['' for _ in range(N_ACTIONS)] +ACTION_NAMES[MISSING] = '?' +ACTION_NAMES[BEGIN] = 'B' +ACTION_NAMES[IN] = 'I' +ACTION_NAMES[LAST] = 'L' +ACTION_NAMES[UNIT] = 'U' +ACTION_NAMES[OUT] = 'O' + + +cdef bint can_begin(State* s, int label): + return not entity_is_open(s) + + +cdef bint can_in(State* s, int label): + return entity_is_open(s) and s.curr.label == label + + +cdef bint can_last(State* s, int label): + return entity_is_open(s) and s.curr.label == label + + +cdef bint can_unit(State* s, int label): + return not entity_is_open(s) + + +cdef bint can_out(State* s, int label): + return not entity_is_open(s) + + +cdef bint is_oracle(ActionType act, int tag, ActionType g_act, int g_tag, + ActionType next_act, bint is_sunk): + if g_act == MISSING: + return True + if act == BEGIN: + if g_act == BEGIN: + # B, Gold B --> Label match + return tag == g_tag + else: + # B, Gold I --> False (P) + # B, Gold L --> False (P) + # B, Gold O --> False (P) + # B, Gold U --> False (P) + return False + elif act == IN: + if g_act == BEGIN: + # I, Gold B --> True (P of bad open entity sunk, R of this entity sunk) + return True + elif g_act == IN: + # I, Gold I --> True (label forced by prev, if mismatch, P and R both sunk) + return True + elif g_act == LAST: + # I, Gold L --> True iff this entity sunk and next tag == O + return is_sunk and (next_act == OUT or next_act == MISSING) + elif g_act == OUT: + # I, Gold O --> True iff next tag == O + return next_act == OUT or next_act == MISSING + elif g_act == UNIT: + # I, Gold U --> True iff next tag == O + return next_act == OUT + elif act == LAST: + if g_act == BEGIN: + # L, Gold B --> True + return True + elif g_act == IN: + # L, Gold I --> True iff this entity sunk + return is_sunk + elif g_act == LAST: + # L, Gold L --> True + return True + elif g_act == OUT: + # L, Gold O --> True + return True + elif g_act == UNIT: + # L, Gold U --> True + return True + elif act == OUT: + if g_act == BEGIN: + # O, Gold B --> False + return False + elif g_act == IN: + # O, Gold I --> True + return True + elif g_act == LAST: + # O, Gold L --> True + return True + elif g_act == OUT: + # O, Gold O --> True + return True + elif g_act == UNIT: + # O, Gold U --> False + return False + elif act == UNIT: + if g_act == UNIT: + # U, Gold U --> True iff tag match + return tag == g_tag + else: + # U, Gold B --> False + # U, Gold I --> False + # U, Gold L --> False + # U, Gold O --> False + return False + + +cdef int set_accept_if_valid(Move* moves, int n_classes, State* s) except 0: + cdef int n_accept = 0 + cdef Move* m + moves[0].accept = False + for i in range(1, n_classes): + m = &moves[i] + if m.action == BEGIN: + m.accept = can_begin(s, m.label) + elif m.action == IN: + m.accept = can_in(s, m.label) + elif m.action == LAST: + m.accept = can_last(s, m.label) + elif m.action == UNIT: + m.accept = can_unit(s, m.label) + elif m.action == OUT: + m.accept = can_out(s, m.label) + n_accept += m.accept + assert n_accept != 0 + return n_accept + + +cdef int set_accept_if_oracle(Move* moves, Move* golds, int n_classes, State* s) except 0: + + cdef Move* g = &golds[s.i] + cdef ActionType next_act = golds[s.i+1].action if s.i < s.length else OUT + cdef bint is_sunk = entity_is_sunk(s, golds) + cdef Move* m + cdef int n_accept = 0 + set_accept_if_valid(moves, n_classes, s) + for i in range(1, n_classes): + m = &moves[i] + if not m.accept: + continue + m.accept = is_oracle(m.action, m.label, g.action, + g.label, next_act, is_sunk) + n_accept += m.accept + assert n_accept != 0 + return n_accept + + +cdef Move* best_accepted(Move* moves, weight_t* scores, int n) except NULL: + cdef int first_accept = -1 + for first_accept in range(1, n): + if moves[first_accept].accept: + break + else: + raise StandardError + assert first_accept != -1 + cdef int best = first_accept + cdef weight_t score = scores[first_accept-1] + cdef int i + for i in range(first_accept+1, n): + if moves[i].accept and scores[i-1] > score: + best = i + score = scores[i-1] + return &moves[best] + + +cdef int transition(State *s, Move* move) except -1: + if move.action == BEGIN: + begin_entity(s, move.label) + elif move.action == IN: + pass + elif move.action == LAST: + end_entity(s) + elif move.action == UNIT: + begin_entity(s, move.label) + end_entity(s) + elif move.action == OUT: + pass + s.tags[s.i] = move.clas + s.i += 1 + + +def get_n_moves(n_tags): + return n_tags + n_tags + n_tags + n_tags + 1 + + +cdef int fill_moves(Move* moves, list tag_names) except -1: + cdef Move* m + label_names = {'-': 0} + for i, tag_name in enumerate(tag_names): + m = &moves[i] + if '-' in tag_name: + action_str, label = tag_name.split('-') + elif tag_name == 'O': + action_str = 'O' + label = '-' + elif tag_name == 'NULL' or tag_name == 'EOL': + action_str = '?' + label = '-' + else: + raise StandardError(tag_name) + m.action = ACTION_NAMES.index(action_str) + m.label = label_names.setdefault(label, len(label_names)) + m.clas = i diff --git a/spacy/ner/context.pxd b/spacy/ner/context.pxd new file mode 100644 index 000000000..f9280c516 --- /dev/null +++ b/spacy/ner/context.pxd @@ -0,0 +1,153 @@ +from thinc.typedefs cimport atom_t +from ..typedefs cimport hash_t +from ..tokens cimport Tokens +from ..lexeme cimport Lexeme +from .structs cimport State + + +cpdef enum: + T_sic + T_cluster + T_norm + T_shape + T_asciied + T_prefix + T_suffix + T_length + T_postype + T_nertype + T_sensetype + T_is_alpha + T_is_ascii + T_is_digit + T_is_lower + T_is_punct + T_is_space + T_is_title + T_is_upper + T_like_url + T_like_number + T_oft_lower + T_oft_title + T_oft_upper + T_in_males + T_in_females + T_in_surnames + T_in_places + T_in_celebs + T_in_names + T_pos + T_sense + T_ner + + +cpdef enum: + P2_sic + P2_cluster + P2_norm + P2_shape + P2_prefix + P2_suffix + P2_length + P2_postype + P2_is_alpha + P2_is_digit + P2_is_lower + P2_is_punct + P2_is_title + P2_is_upper + P2_like_number + P2_pos + + P1_sic + P1_cluster + P1_norm + P1_shape + P1_prefix + P1_suffix + P1_length + P1_postype + P1_is_alpha + P1_is_digit + P1_is_lower + P1_is_punct + P1_is_title + P1_is_upper + P1_like_number + P1_pos + + W_sic + W_cluster + W_norm + W_shape + W_prefix + W_suffix + W_length + W_postype + W_is_alpha + W_is_digit + W_is_lower + W_is_punct + W_is_space + W_is_title + W_is_upper + W_like_number + W_pos + + N1_sic + N1_cluster + N1_norm + N1_shape + N1_prefix + N1_suffix + N1_length + N1_postype + N1_is_alpha + N1_is_ascii + N1_is_digit + N1_is_lower + N1_is_punct + N1_is_space + N1_is_title + N1_is_upper + N1_like_number + N1_pos + + N2_sic + N2_cluster + N2_norm + N2_shape + N2_asciied + N2_prefix + N2_suffix + N2_length + N2_postype + N2_is_alpha + N2_is_digit + N2_is_lower + N2_is_punct + N2_is_space + N2_is_title + N2_is_upper + N2_like_number + N2_pos + N2_sense + + E0_sic + E0_cluster + E0_pos + + E1_sic + E1_cluster + E1_pos + + E_last_sic + E_last_cluster + E_last_pos + + N_FIELDS + + +cdef int fill_context(atom_t* context, State* s, Tokens tokens) except -1 + + diff --git a/spacy/ner/context.pyx b/spacy/ner/context.pyx new file mode 100644 index 000000000..c062bb098 --- /dev/null +++ b/spacy/ner/context.pyx @@ -0,0 +1,76 @@ +from libc.string cimport memset + +from murmurhash.mrmr cimport hash64 +from ._state cimport entity_is_open +from ..lexeme cimport * + + +cdef int _fill_token(atom_t* c, Lexeme* lex, atom_t pos): + c[T_sic] = lex.sic + c[T_cluster] = lex.cluster + c[T_norm] = lex.norm if (lex.prob != 0 and lex.prob >= -10) else lex.shape + c[T_shape] = lex.shape + c[T_asciied] = lex.asciied + c[T_prefix] = lex.prefix + c[T_suffix] = lex.suffix + c[T_length] = lex.length + + c[T_postype] = lex.postype + c[T_nertype] = 0 + c[T_sensetype] = 0 + + c[T_is_alpha] = lex.flags & (1 << IS_ALPHA) + c[T_is_digit] = lex.flags & (1 << IS_DIGIT) + c[T_is_lower] = lex.flags & (1 << IS_LOWER) + c[T_is_punct] = lex.flags & (1 << IS_PUNCT) + c[T_is_space] = lex.flags & (1 << IS_SPACE) + c[T_is_title] = lex.flags & (1 << IS_TITLE) + c[T_is_upper] = lex.flags & (1 << IS_UPPER) + c[T_like_url] = lex.flags & (1 << LIKE_URL) + c[T_like_number] = lex.flags & (1 << LIKE_NUMBER) + c[T_oft_lower] = lex.flags & (1 << OFT_LOWER) + c[T_oft_title] = lex.flags & (1 << OFT_TITLE) + c[T_oft_upper] = lex.flags & (1 << OFT_UPPER) + + c[T_in_males] = lex.flags & (1 << IN_MALES) + c[T_in_females] = lex.flags & (1 << IN_FEMALES) + c[T_in_surnames] = lex.flags & (1 << IN_SURNAMES) + c[T_in_places] = lex.flags & (1 << IN_PLACES) + c[T_in_celebs] = lex.flags & (1 << IN_CELEBS) + c[T_in_names] = lex.flags & (1 << IN_NAMES) + + c[T_pos] = pos + c[T_sense] = 0 + + +cdef int _fill_outer_token(atom_t* c, Lexeme* lex, atom_t pos): + c[0] = lex.sic + c[1] = lex.cluster + c[2] = lex.shape + c[3] = pos + + +cdef int fill_context(atom_t* context, State* s, Tokens tokens) except -1: + cdef int i + for i in range(N_FIELDS): + context[i] = 0 + i = s.i + _fill_token(&context[P2_sic], tokens.lex[i-2], tokens.pos[i-2]) + _fill_token(&context[P1_sic], tokens.lex[i-1], tokens.pos[i-1]) + _fill_token(&context[W_sic], tokens.lex[i], tokens.pos[i]) + _fill_token(&context[N1_sic], tokens.lex[i+1], tokens.pos[i+1]) + _fill_token(&context[N2_sic], tokens.lex[i+2], tokens.pos[i+2]) + + cdef atom_t[5] ent_vals + if entity_is_open(s): + context[E0_sic] = tokens.lex[s.curr.start].sic + context[E0_cluster] = tokens.lex[s.curr.start].cluster + context[E0_pos] = tokens.pos[s.curr.start] + context[E_last_sic] = tokens.lex[s.i-1].sic + context[E_last_cluster] = tokens.lex[s.i-1].cluster + context[E_last_pos] = tokens.pos[s.i-1] + if (s.curr.start + 1) < s.i: + context[E1_sic] = tokens.lex[s.curr.start+1].sic + context[E1_cluster] = tokens.lex[s.curr.start+1].cluster + context[E1_pos] = tokens.pos[s.curr.start+1] + return 1 diff --git a/spacy/ner/feats.pxd b/spacy/ner/feats.pxd new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/ner/feats.pyx b/spacy/ner/feats.pyx new file mode 100644 index 000000000..60910f235 --- /dev/null +++ b/spacy/ner/feats.pyx @@ -0,0 +1,99 @@ +from .context import * + + +LOCAL = ( + (W_sic,), + (P1_sic,), + (N1_sic,), + (P2_sic,), + (N2_sic,), + + (P1_sic, W_sic,), + (W_sic, N1_sic), + + (W_prefix,), + (W_suffix,), + + (P1_shape,), + (W_shape,), + (N1_shape,), + (P1_shape, W_shape,), + (W_shape, P1_shape,), + (P1_shape, W_shape, N1_shape), + (N2_shape,), + (P2_shape,), + + (P2_norm, P1_norm, W_norm), + (P1_norm, W_norm, N1_norm), + (W_norm, N1_norm, N2_norm) +) + +POS = ( + (P2_pos,), + (P1_pos,), + (W_pos,), + (N1_pos,), + (N2_pos,), + + (P1_pos, W_pos), + (W_pos, N1_pos), + (P2_pos, P1_pos, W_pos), + (P1_pos, W_pos, N1_pos), + (W_pos, N1_pos, N2_pos) +) + +CLUSTERS = ( + (P2_cluster,), + (P1_cluster,), + (W_cluster,), + (N1_cluster,), + (N2_cluster,), + + (P1_cluster, W_cluster), + (W_cluster, N1_cluster), +) + + +CLUSTER_POS = ( + (P1_cluster, W_pos), + (W_pos, P1_cluster), + (W_cluster, N1_pos), + (W_pos, N1_cluster) +) + + +STATE = ( + (E0_sic,), + (E0_cluster,), + (E0_pos,), + (E_last_sic,), + (E_last_cluster,), + (E_last_pos,), + + (E0_sic, W_sic), + (E0_cluster, W_cluster), + (E0_pos, W_pos), + (E_last_sic, W_sic), + (E_last_pos, W_pos), + + (E0_pos, E_last_pos, W_pos), + (E0_cluster, E_last_cluster, W_cluster), + + (E0_sic, E_last_sic), + (E0_pos, E_last_pos), + (E0_cluster, E_last_cluster), + (E0_pos, E_last_cluster), + (E0_cluster, E_last_pos), + + (E1_sic,), + (E1_cluster,), + (E1_pos,), + + (E0_sic, E1_sic), + (E0_sic, E1_pos,), + (E0_pos, E1_sic,), + (E0_pos, E1_pos), +) + + +TEMPLATES = LOCAL + CLUSTERS + POS + CLUSTER_POS + STATE diff --git a/spacy/ner/greedy_parser.pxd b/spacy/ner/greedy_parser.pxd new file mode 100644 index 000000000..9ee4d668d --- /dev/null +++ b/spacy/ner/greedy_parser.pxd @@ -0,0 +1,29 @@ +from cymem.cymem cimport Pool +from thinc.features cimport Extractor +from thinc.learner cimport LinearModel +from thinc.typedefs cimport * + +from ..tokens cimport Tokens +from ..typedefs cimport * + +from .structs cimport Move +from .annot cimport NERAnnotation + + +cdef class NERParser: + cdef Pool mem + cdef Extractor extractor + cdef LinearModel model + cdef readonly list tag_names + cdef readonly list entity_types + cdef readonly int n_classes + + cdef Move* _moves + cdef atom_t* _context + cdef feat_t* _feats + cdef weight_t* _values + cdef weight_t* _scores + + + cpdef list train(self, Tokens tokens, NERAnnotation annot) + cpdef list set_tags(self, Tokens tokens) diff --git a/spacy/ner/greedy_parser.pyx b/spacy/ner/greedy_parser.pyx new file mode 100644 index 000000000..5825c7539 --- /dev/null +++ b/spacy/ner/greedy_parser.pyx @@ -0,0 +1,139 @@ +from __future__ import division +from __future__ import unicode_literals + +cimport cython +import random +import os +from os import path +import shutil +import json + +from thinc.features cimport ConjFeat + +from .context cimport fill_context +from .context cimport N_FIELDS +from .structs cimport Move, State +from .io_moves cimport fill_moves, transition, best_accepted +from .io_moves cimport set_accept_if_valid, set_accept_if_oracle +from .io_moves import get_n_moves +from ._state cimport init_state +from ._state cimport entity_is_open +from ._state cimport end_entity +from .annot cimport NERAnnotation + + +def setup_model_dir(entity_types, templates, model_dir): + if path.exists(model_dir): + shutil.rmtree(model_dir) + os.mkdir(model_dir) + config = { + 'templates': templates, + 'entity_types': entity_types, + } + with open(path.join(model_dir, 'config.json'), 'w') as file_: + json.dump(config, file_) + + +def train(train_sents, model_dir, nr_iter=10): + cdef Tokens tokens + cdef NERAnnotation gold_ner + parser = NERParser(model_dir) + for _ in range(nr_iter): + tp = 0 + fp = 0 + fn = 0 + for i, (tokens, gold_ner) in enumerate(train_sents): + #print [tokens[i].string for i in range(tokens.length)] + test_ents = set(parser.train(tokens, gold_ner)) + #print 'Test', test_ents + gold_ents = set(gold_ner.entities) + #print 'Gold', set(gold_ner.entities) + tp += len(gold_ents.intersection(test_ents)) + fp += len(test_ents - gold_ents) + fn += len(gold_ents - test_ents) + p = tp / (tp + fp) + r = tp / (tp + fn) + f = 2 * ((p * r) / (p + r)) + print 'P: %.3f' % p, + print 'R: %.3f' % r, + print 'F: %.3f' % f + random.shuffle(train_sents) + parser.model.end_training() + parser.model.dump(path.join(model_dir, 'model')) + + +cdef class NERParser: + def __init__(self, model_dir): + self.mem = Pool() + cfg = json.load(open(path.join(model_dir, 'config.json'))) + templates = cfg['templates'] + self.extractor = Extractor(templates, [ConjFeat] * len(templates)) + self.entity_types = cfg['entity_types'] + self.n_classes = get_n_moves(len(self.entity_types)) + self._moves = self.mem.alloc(self.n_classes, sizeof(Move)) + fill_moves(self._moves, self.n_classes, self.entity_types) + self.model = LinearModel(self.n_classes) + if path.exists(path.join(model_dir, 'model')): + self.model.load(path.join(model_dir, 'model')) + + self._context = self.mem.alloc(N_FIELDS, sizeof(atom_t)) + self._feats = self.mem.alloc(self.extractor.n+1, sizeof(feat_t)) + self._values = self.mem.alloc(self.extractor.n+1, sizeof(weight_t)) + self._scores = self.mem.alloc(self.model.nr_class, sizeof(weight_t)) + + cpdef list train(self, Tokens tokens, NERAnnotation annot): + cdef Pool mem = Pool() + cdef State* s = init_state(mem, tokens.length) + cdef Move* guess + cdef Move* oracle_move + n_correct = 0 + cdef int f = 0 + while s.i < tokens.length: + fill_context(self._context, s, tokens) + self.extractor.extract(self._feats, self._values, self._context, NULL) + self.model.score(self._scores, self._feats, self._values) + + set_accept_if_valid(self._moves, self.n_classes, s) + guess = best_accepted(self._moves, self._scores, self.n_classes) + assert guess.clas != 0 + set_accept_if_oracle(self._moves, self.n_classes, s, + annot.starts, annot.ends, annot.labels) + oracle_move = best_accepted(self._moves, self._scores, self.n_classes) + assert oracle_move.clas != 0 + if guess.clas == oracle_move.clas: + counts = {} + n_correct += 1 + else: + counts = {guess.clas: {}, oracle_move.clas: {}} + self.extractor.count(counts[oracle_move.clas], self._feats, 1) + self.extractor.count(counts[guess.clas], self._feats, -1) + self.model.update(counts) + transition(s, guess) + tokens.ner[s.i-1] = s.tags[s.i-1] + if entity_is_open(s): + s.curr.label = annot.labels[s.curr.start] + end_entity(s) + entities = [] + for i in range(s.j): + entities.append((s.ents[i].start, s.ents[i].end, s.ents[i].label)) + return entities + + cpdef list set_tags(self, Tokens tokens): + cdef Pool mem = Pool() + cdef State* s = init_state(mem, tokens.length) + cdef Move* move + while s.i < tokens.length: + fill_context(self._context, s, tokens) + self.extractor.extract(self._feats, self._values, self._context, NULL) + self.model.score(self._scores, self._feats, self._values) + set_accept_if_valid(self._moves, self.n_classes, s) + move = best_accepted(self._moves, self._scores, self.n_classes) + transition(s, move) + tokens.ner[s.i-1] = s.tags[s.i-1] + if entity_is_open(s): + s.curr.label = move.label + end_entity(s) + entities = [] + for i in range(s.j): + entities.append((s.ents[i].start, s.ents[i].end, s.ents[i].label)) + return entities diff --git a/spacy/ner/io_moves.pxd b/spacy/ner/io_moves.pxd new file mode 100644 index 000000000..97f9512e8 --- /dev/null +++ b/spacy/ner/io_moves.pxd @@ -0,0 +1,26 @@ +from cymem.cymem cimport Pool + +from thinc.typedefs cimport class_t +from thinc.typedefs cimport weight_t + +from .structs cimport State, Move + + +cpdef enum ActionType: + MISSING + SHIFT + REDUCE + OUT + N_ACTIONS + + +cdef int set_accept_if_oracle(Move* moves, int n, State* s, + int* g_starts, int* g_ends, int* g_labels) except 0 + +cdef int set_accept_if_valid(Move* moves, int n, State* s) except 0 + +cdef Move* best_accepted(Move* moves, weight_t* scores, int n) except NULL + +cdef int transition(State *s, Move* m) except -1 + +cdef int fill_moves(Move* moves, int n, list entity_types) except -1 diff --git a/spacy/ner/io_moves.pyx b/spacy/ner/io_moves.pyx new file mode 100644 index 000000000..dc268e4a5 --- /dev/null +++ b/spacy/ner/io_moves.pyx @@ -0,0 +1,152 @@ +from __future__ import unicode_literals +from cymem.cymem cimport Pool + +from thinc.typedefs cimport class_t +from thinc.typedefs cimport weight_t + +from ._state cimport begin_entity +from ._state cimport end_entity +from ._state cimport entity_is_open + + +ACTION_NAMES = ['' for _ in range(N_ACTIONS)] +ACTION_NAMES[MISSING] = '?' +ACTION_NAMES[SHIFT] = 'S' +ACTION_NAMES[REDUCE] = 'R' +ACTION_NAMES[OUT] = 'O' + + +cdef int set_accept_if_oracle(Move* moves, int n, State* s, + int* g_starts, int* g_ends, int* g_labels) except 0: + # If curr entity: (O invalid) + # if cost is not sunk (start matches, end is i-1 or greater + # - If i-1 == gold.end --> R=True, S=False + # - Shift if end >= i --> S=True, R=False + # else + # - If i == gold.start --> R=True, S=False + # - Else --> R=True, S=True + # Else (R invalid): + # if start == gold.start: S=True, O=False + # else: O=True, S=False + if entity_is_open(s): + g_start = g_starts[s.curr.start] + g_end = g_ends[s.curr.start] + accept_o = False + if g_start == s.curr.start and g_end == s.i: + accept_r = True + accept_s = False + elif g_start == s.curr.start and g_end > s.i: + accept_s = True + s_label = s.curr.label + accept_r = False + elif g_starts[s.i] == s.i: + accept_r = True + accept_s = False + else: + accept_r = True + accept_s = True + s_label = s.curr.label + else: + accept_r = False + if g_starts[s.i] == s.i: + accept_s = True + s_label = g_labels[s.i] + accept_o = False + else: + accept_o = True + accept_s = False + n_accept = 0 + moves[0].accept = False + for i in range(1, n): + m = &moves[i] + if m.action == SHIFT: + m.accept = accept_s and m.label == s_label + elif m.action == REDUCE: + m.accept = accept_r + elif m.action == OUT: + m.accept = accept_o + n_accept += m.accept + assert n_accept != 0 + return n_accept + + +cdef int set_accept_if_valid(Move* moves, int n, State* s) except 0: + cdef int i + cdef bint open_ent = entity_is_open(s) + cdef int n_accept = 0 + moves[0].accept = False + for i in range(1, n): + if moves[i].action == SHIFT: + moves[i].accept = moves[i].label == s.curr.label or not entity_is_open(s) + elif moves[i].action == REDUCE: + moves[i].accept = open_ent + elif moves[i].action == OUT: + moves[i].accept = not open_ent + n_accept += moves[i].accept + return n_accept + + +cdef Move* best_accepted(Move* moves, weight_t* scores, int n) except NULL: + cdef int first_accept = -1 + for first_accept in range(1, n): + if moves[first_accept].accept: + break + else: + raise StandardError + assert first_accept != -1 + cdef int best = first_accept + cdef weight_t score = scores[first_accept-1] + cdef int i + for i in range(first_accept+1, n): + if moves[i].accept and scores[i-1] > score: + best = i + score = scores[i-1] + return &moves[best] + + +cdef int transition(State *s, Move* move) except -1: + s.tags[s.i] = move.clas + if move.action == OUT: + s.i += 1 + elif move.action == SHIFT: + if not entity_is_open(s): + s.curr.start = s.i + s.curr.label = move.label + s.i += 1 + elif move.action == REDUCE: + s.curr.end = s.i + s.ents[s.j] = s.curr + s.j += 1 + s.curr.start = 0 + s.curr.label = -1 + s.curr.end = 0 + else: + raise ValueError(move.action) + + +def get_n_moves(n_tags): + return 1 + 1 + 1 + n_tags + + +cdef int fill_moves(Move* moves, int n, list entity_types) except -1: + cdef Move* m + label_names = {'-': 0} + # Reserve class 0 + cdef int i = 0 + moves[i].clas = i + moves[i].action = MISSING + moves[i].label = 0 + i += 1 + for entity_type in entity_types: + moves[i].action = SHIFT + moves[i].label = label_names.setdefault(entity_type, len(label_names)) + moves[i].clas = i + i += 1 + moves[i].clas = i + moves[i].action = OUT + moves[i].label = 0 + i += 1 + moves[i].action = REDUCE + moves[i].clas = i + moves[i].label = 0 + i += 1 diff --git a/spacy/ner/pystate.pxd b/spacy/ner/pystate.pxd new file mode 100644 index 000000000..9293fae01 --- /dev/null +++ b/spacy/ner/pystate.pxd @@ -0,0 +1,16 @@ +from cymem.cymem cimport Pool + +from .structs cimport Move, State + + +cdef class PyState: + cdef Pool mem + cdef readonly list tag_names + cdef readonly int n_classes + cdef readonly dict moves_by_name + + cdef Move* _moves + cdef Move* _golds + cdef State* _s + + cdef Move* _get_move(self, unicode move_name) except NULL diff --git a/spacy/ner/pystate.pyx b/spacy/ner/pystate.pyx new file mode 100644 index 000000000..ba18c2f07 --- /dev/null +++ b/spacy/ner/pystate.pyx @@ -0,0 +1,60 @@ +from __future__ import unicode_literals + +from ._state cimport init_state +from ._state cimport entity_is_open +from .bilou_moves cimport fill_moves +from .bilou_moves cimport transition +from .bilou_moves cimport set_accept_if_valid, set_accept_if_oracle +from .bilou_moves import get_n_moves +from .bilou_moves import ACTION_NAMES + + +cdef class PyState: + def __init__(self, tag_names, n_tokens): + self.mem = Pool() + self.tag_names = tag_names + self.n_classes = len(tag_names) + assert self.n_classes != 0 + self._moves = self.mem.alloc(self.n_classes, sizeof(Move)) + fill_moves(self._moves, tag_names) + self._s = init_state(self.mem, n_tokens) + self._golds = self.mem.alloc(n_tokens, sizeof(Move)) + + cdef Move* _get_move(self, unicode move_name) except NULL: + return &self._moves[self.tag_names.index(move_name)] + + def set_golds(self, list gold_names): + cdef Move* m + for i, name in enumerate(gold_names): + m = self._get_move(name) + self._golds[i] = m[0] + + def transition(self, unicode move_name): + cdef Move* m = self._get_move(move_name) + transition(self._s, m) + + def is_valid(self, unicode move_name): + cdef Move* m = self._get_move(move_name) + set_accept_if_valid(self._moves, self.n_classes, self._s) + return m.accept + + def is_gold(self, unicode move_name): + cdef Move* m = self._get_move(move_name) + set_accept_if_oracle(self._moves, self._golds, self.n_classes, self._s) + return m.accept + + property ent: + def __get__(self): + return self._s.curr + + property n_ents: + def __get__(self): + return self._s.j + + property i: + def __get__(self): + return self._s.i + + property open_entity: + def __get__(self): + return entity_is_open(self._s) diff --git a/spacy/ner/structs.pxd b/spacy/ner/structs.pxd new file mode 100644 index 000000000..7d6ebed19 --- /dev/null +++ b/spacy/ner/structs.pxd @@ -0,0 +1,23 @@ +from thinc.typedefs cimport class_t + + +cdef struct Entity: + int start + int end + int label + + +cdef struct State: + Entity curr + Entity* ents + int* tags + int i + int j + int length + + +cdef struct Move: + class_t clas + int action + int label + bint accept