diff --git a/spacy/ner/__init__.pxd b/spacy/ner/__init__.pxd deleted file mode 100644 index e69de29bb..000000000 diff --git a/spacy/ner/__init__.py b/spacy/ner/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/spacy/ner/_feats.pxd b/spacy/ner/_feats.pxd deleted file mode 100644 index e69de29bb..000000000 diff --git a/spacy/ner/_feats.pyx b/spacy/ner/_feats.pyx deleted file mode 100644 index c1b6e1c35..000000000 --- a/spacy/ner/_feats.pyx +++ /dev/null @@ -1,169 +0,0 @@ -from spacy.context cimport FIELD_IDS, Token - - -cdef Token P4 = FIELD_IDS.P4 -cdef Token P3 = FIELD_IDS.P3 -cdef Token P2 = FIELD_IDS.P2 -cdef Token P1 = FIELD_IDS.P1 -cdef Token N0 = FIELD_IDS.N0 -cdef Token N1 = FIELD_IDS.N1 -cdef Token N2 = FIELD_IDS.N2 -cdef Token N3 = FIELD_IDS.N3 -cdef Token N4 = FIELD_IDS.N4 - -""" -TEMPLATES = ( - (N0.sic,), - (N0.cluster,), - - (P1.pos,), - (P1.sic,), - - (N1.norm,), - (N1.pos,), - - (P1.ner,), - (P2.ner,), - - (N0.cluster,), - (P1.cluster,), - (N1.cluster,), - - (N0.is_alpha,), - (N0.is_digit,), - (N0.is_title,), - (N0.is_upper,), - - (N0.is_title, N0.oft_title), - (N0.is_upper, N0.oft_upper), - - (P1.cluster, N0.norm), - (N0.norm, N1.cluster), - - (P1.ner, N0.pos), - (P2.ner, P1.ner, N0.pos), - - (P2.pos, P1.pos, N0.sic), - (N0.sic, N1.pos, N2.pos) -) -""" - -LOCAL = ( - (N0.sic,), - (P1.sic,), - (N1.sic,), - (P2.sic,), - (N2.sic,), - (P3.sic,), - (N3.sic,), - (P4.sic,), - (N4.sic,), - - (P1.sic, N0.sic,), - (N0.sic, N1.sic), - - (N0.prefix,), - (N0.suffix,), - - (P1.shape,), - (N0.shape,), - (N1.shape,), - (P1.shape, N0.shape,), - (N0.shape, P1.shape,), - (P1.shape, N0.shape, N1.shape), - (N2.shape,), - (P2.shape,), - (P3.shape,), - (N3.shape,), - (P4.shape,), - (N4.shape,), - - (P2.norm, P1.norm, N0.norm), - (P1.norm, N0.norm, N1.norm), - (N0.norm, N1.norm, N2.norm) -) - -BOOLS = ( - (N0.is_title,), -) - - -HISTORY = ( - (P1.ner,), - (P1.ner, N0.sic,), - (P2.ner,), - (P2.ner, P1.ner), - (P2.ner, P1.ner, N0.sic), - (P2.pos, P1.ner, N0.pos), - (P2.ner, P1.pos, N0.pos), - (P3.ner,), - (P4.ner,), -) - -POS = ( - (P4.pos,), - (P3.pos,), - (P2.pos,), - (P1.pos,), - (N0.pos,), - (N1.pos,), - (N2.pos,), - (N3.pos,), - (N4.pos,), - - (P1.pos, N0.pos), - (N0.pos, N1.pos), - (P2.pos, P1.pos, N0.pos), - (P1.pos, N0.pos, N1.pos), - (N0.pos, N1.pos, N2.pos) -) - -CLUSTERS = ( - (P4.cluster,), - (P3.cluster,), - (P2.cluster,), - (P1.cluster,), - (N0.cluster,), - (N1.cluster,), - (N2.cluster,), - (N3.cluster,), - (N4.cluster,), - - (P1.cluster, N0.cluster), - (N0.cluster, N1.cluster), -) - - -CLUSTER_POS = ( - (P1.cluster, N0.pos), - (N0.pos, P1.cluster), - (N0.cluster, N1.pos), - (N0.pos, N1.cluster) -) - - -GAZ = ( - (N0.in_males,), - (N0.in_females,), - (N0.in_surnames,), - (N0.in_places,), - (N0.in_games,), - (N0.in_celebs,), - (N0.in_names,), - (P1.in_males,), - (P1.in_females,), - (P1.in_surnames,), - (P1.in_places,), - (P1.in_games,), - (P1.in_celebs,), - (P1.in_names,), - (N1.in_males,), - (N1.in_females,), - (N1.in_surnames,), - (N1.in_places,), - (N1.in_games,), - (N1.in_celebs,), - (N1.in_names,), -) - -TEMPLATES = LOCAL + HISTORY + CLUSTERS + POS + CLUSTER_POS + GAZ + BOOLS diff --git a/spacy/ner/_state.pxd b/spacy/ner/_state.pxd deleted file mode 100644 index 43b37d3bd..000000000 --- a/spacy/ner/_state.pxd +++ /dev/null @@ -1,12 +0,0 @@ -from cymem.cymem cimport Pool -from .structs cimport State, Entity, Move - -cdef int begin_entity(State* s, label) except -1 - -cdef int end_entity(State* s) except -1 - -cdef State* init_state(Pool mem, int sent_length) except NULL - -cdef bint entity_is_open(State *s) except -1 - -cdef bint entity_is_sunk(State *s, Move* golds) except -1 diff --git a/spacy/ner/_state.pyx b/spacy/ner/_state.pyx deleted file mode 100644 index 7f1892371..000000000 --- a/spacy/ner/_state.pyx +++ /dev/null @@ -1,44 +0,0 @@ -from .bilou_moves cimport BEGIN, UNIT - - -cdef int begin_entity(State* s, label) except -1: - s.curr.start = s.i - s.curr.label = label - - -cdef int end_entity(State* s) except -1: - s.curr.end = s.i - s.ents[s.j] = s.curr - s.j += 1 - s.curr.start = 0 - s.curr.label = -1 - s.curr.end = 0 - - -cdef State* init_state(Pool mem, int sent_length) except NULL: - s = mem.alloc(1, sizeof(State)) - s.j = 0 - s.ents = mem.alloc(sent_length, sizeof(Entity)) - for i in range(sent_length): - s.ents[i].label = -1 - s.curr.label = -1 - s.tags = mem.alloc(sent_length, sizeof(int)) - s.length = sent_length - return s - - -cdef bint entity_is_open(State *s) except -1: - return s.curr.label != -1 - - -cdef bint entity_is_sunk(State *s, Move* golds) except -1: - if not entity_is_open(s): - return False - - cdef Move* gold = &golds[s.curr.start] - if gold.action != BEGIN and gold.action != UNIT: - return True - elif gold.label != s.curr.label: - return True - else: - return False diff --git a/spacy/ner/annot.pxd b/spacy/ner/annot.pxd deleted file mode 100644 index b1b49d64f..000000000 --- a/spacy/ner/annot.pxd +++ /dev/null @@ -1,8 +0,0 @@ -from cymem.cymem cimport Pool - -cdef class NERAnnotation: - cdef Pool mem - cdef int* starts - cdef int* ends - cdef int* labels - cdef readonly list entities diff --git a/spacy/ner/annot.pyx b/spacy/ner/annot.pyx deleted file mode 100644 index a1e582e5c..000000000 --- a/spacy/ner/annot.pyx +++ /dev/null @@ -1,94 +0,0 @@ -from libc.string cimport memset - - -cdef class NERAnnotation: - def __init__(self, entities, length, entity_types): - self.mem = Pool() - self.starts = self.mem.alloc(length, sizeof(int)) - self.ends = self.mem.alloc(length, sizeof(int)) - self.labels = self.mem.alloc(length, sizeof(int)) - self.entities = entities - memset(self.starts, -1, sizeof(int) * length) - memset(self.ends, -1, sizeof(int) * length) - memset(self.labels, -1, sizeof(int) * length) - - cdef int start, end, label - for start, end, label in entities: - for i in range(start, end): - self.starts[i] = start - self.ends[i] = end - self.labels[i] = label - - @classmethod - def from_bilous(cls, tag_strs, entity_types): - entities = [] - start = None - for i, tag_str in enumerate(tag_strs): - if tag_str == 'O' or tag_str == '-': - continue - move, label_str = tag_str.split('-') - label = entity_types.index(label_str) - if label == -1: - label = len(entity_types) - entity_types.append(label) - if move == 'U': - assert start is None - entities.append((i, i+1, label)) - elif move == 'B': - assert start is None - start = i - elif move == 'L': - assert start is not None - entities.append((start, i+1, label)) - start = None - return cls(entities, len(tag_strs), entity_types) - - - -def read_iob(file_, entity_types, create_tokens): - sent_strs = file_.read().strip().split('\n\n') - sents = [] - for sent_str in sent_strs: - if sent_str.startswith('-DOCSTART-'): - continue - words = [] - iob = [] - for token_str in sent_str.split('\n'): - word, pos, chunk, ner = token_str.split() - words.append(word) - iob.append(ner) - bilou = iob_to_bilou(iob) - tokens = create_tokens(words) - sents.append((tokens, NERAnnotation.from_bilous(bilou, entity_types))) - return sents - - -def iob_to_bilou(tags): - out = [] - curr_label = None - tags = list(tags) - while tags: - out.extend(_consume_os(tags)) - out.extend(_consume_ent(tags)) - return out - -def _consume_os(tags): - while tags and tags[0] == 'O': - yield tags.pop(0) - -def _consume_ent(tags): - if not tags: - return [] - target = tags.pop(0).replace('B', 'I') - length = 1 - while tags and tags[0] == target: - length += 1 - tags.pop(0) - label = target[2:] - if length == 1: - return ['U-' + label] - else: - start = 'B-' + label - end = 'L-' + label - middle = ['I-%s' % label for _ in range(1, length - 1)] - return [start] + middle + [end] diff --git a/spacy/ner/bilou_moves.pxd b/spacy/ner/bilou_moves.pxd deleted file mode 100644 index 20ec58291..000000000 --- a/spacy/ner/bilou_moves.pxd +++ /dev/null @@ -1,27 +0,0 @@ -from cymem.cymem cimport Pool - -from thinc.typedefs cimport class_t -from thinc.typedefs cimport weight_t - -from .structs cimport State, Move - - -cpdef enum ActionType: - MISSING - BEGIN - IN - LAST - UNIT - OUT - N_ACTIONS - - -cdef int set_accept_if_oracle(Move* moves, Move* golds, int n, State* s) except 0 - -cdef int set_accept_if_valid(Move* moves, int n, State* s) except 0 - -cdef Move* best_accepted(Move* moves, weight_t* scores, int n) except NULL - -cdef int transition(State *s, Move* m) except -1 - -cdef int fill_moves(Move* moves, list tag_names) except -1 diff --git a/spacy/ner/bilou_moves.pyx b/spacy/ner/bilou_moves.pyx deleted file mode 100644 index a73a48135..000000000 --- a/spacy/ner/bilou_moves.pyx +++ /dev/null @@ -1,207 +0,0 @@ -from __future__ import unicode_literals - -from ._state cimport begin_entity -from ._state cimport end_entity -from ._state cimport entity_is_open -from ._state cimport entity_is_sunk - - -ACTION_NAMES = ['' for _ in range(N_ACTIONS)] -ACTION_NAMES[MISSING] = '?' -ACTION_NAMES[BEGIN] = 'B' -ACTION_NAMES[IN] = 'I' -ACTION_NAMES[LAST] = 'L' -ACTION_NAMES[UNIT] = 'U' -ACTION_NAMES[OUT] = 'O' - - -cdef bint can_begin(State* s, int label): - return not entity_is_open(s) - - -cdef bint can_in(State* s, int label): - return entity_is_open(s) and s.curr.label == label - - -cdef bint can_last(State* s, int label): - return entity_is_open(s) and s.curr.label == label - - -cdef bint can_unit(State* s, int label): - return not entity_is_open(s) - - -cdef bint can_out(State* s, int label): - return not entity_is_open(s) - - -cdef bint is_oracle(ActionType act, int tag, ActionType g_act, int g_tag, - ActionType next_act, bint is_sunk): - if g_act == MISSING: - return True - if act == BEGIN: - if g_act == BEGIN: - # B, Gold B --> Label match - return tag == g_tag - else: - # B, Gold I --> False (P) - # B, Gold L --> False (P) - # B, Gold O --> False (P) - # B, Gold U --> False (P) - return False - elif act == IN: - if g_act == BEGIN: - # I, Gold B --> True (P of bad open entity sunk, R of this entity sunk) - return True - elif g_act == IN: - # I, Gold I --> True (label forced by prev, if mismatch, P and R both sunk) - return True - elif g_act == LAST: - # I, Gold L --> True iff this entity sunk and next tag == O - return is_sunk and (next_act == OUT or next_act == MISSING) - elif g_act == OUT: - # I, Gold O --> True iff next tag == O - return next_act == OUT or next_act == MISSING - elif g_act == UNIT: - # I, Gold U --> True iff next tag == O - return next_act == OUT - elif act == LAST: - if g_act == BEGIN: - # L, Gold B --> True - return True - elif g_act == IN: - # L, Gold I --> True iff this entity sunk - return is_sunk - elif g_act == LAST: - # L, Gold L --> True - return True - elif g_act == OUT: - # L, Gold O --> True - return True - elif g_act == UNIT: - # L, Gold U --> True - return True - elif act == OUT: - if g_act == BEGIN: - # O, Gold B --> False - return False - elif g_act == IN: - # O, Gold I --> True - return True - elif g_act == LAST: - # O, Gold L --> True - return True - elif g_act == OUT: - # O, Gold O --> True - return True - elif g_act == UNIT: - # O, Gold U --> False - return False - elif act == UNIT: - if g_act == UNIT: - # U, Gold U --> True iff tag match - return tag == g_tag - else: - # U, Gold B --> False - # U, Gold I --> False - # U, Gold L --> False - # U, Gold O --> False - return False - - -cdef int set_accept_if_valid(Move* moves, int n_classes, State* s) except 0: - cdef int n_accept = 0 - cdef Move* m - moves[0].accept = False - for i in range(1, n_classes): - m = &moves[i] - if m.action == BEGIN: - m.accept = can_begin(s, m.label) - elif m.action == IN: - m.accept = can_in(s, m.label) - elif m.action == LAST: - m.accept = can_last(s, m.label) - elif m.action == UNIT: - m.accept = can_unit(s, m.label) - elif m.action == OUT: - m.accept = can_out(s, m.label) - n_accept += m.accept - assert n_accept != 0 - return n_accept - - -cdef int set_accept_if_oracle(Move* moves, Move* golds, int n_classes, State* s) except 0: - - cdef Move* g = &golds[s.i] - cdef ActionType next_act = golds[s.i+1].action if s.i < s.length else OUT - cdef bint is_sunk = entity_is_sunk(s, golds) - cdef Move* m - cdef int n_accept = 0 - set_accept_if_valid(moves, n_classes, s) - for i in range(1, n_classes): - m = &moves[i] - if not m.accept: - continue - m.accept = is_oracle(m.action, m.label, g.action, - g.label, next_act, is_sunk) - n_accept += m.accept - assert n_accept != 0 - return n_accept - - -cdef Move* best_accepted(Move* moves, weight_t* scores, int n) except NULL: - cdef int first_accept = -1 - for first_accept in range(1, n): - if moves[first_accept].accept: - break - else: - raise StandardError - assert first_accept != -1 - cdef int best = first_accept - cdef weight_t score = scores[first_accept-1] - cdef int i - for i in range(first_accept+1, n): - if moves[i].accept and scores[i-1] > score: - best = i - score = scores[i-1] - return &moves[best] - - -cdef int transition(State *s, Move* move) except -1: - if move.action == BEGIN: - begin_entity(s, move.label) - elif move.action == IN: - pass - elif move.action == LAST: - end_entity(s) - elif move.action == UNIT: - begin_entity(s, move.label) - end_entity(s) - elif move.action == OUT: - pass - s.tags[s.i] = move.clas - s.i += 1 - - -def get_n_moves(n_tags): - return n_tags + n_tags + n_tags + n_tags + 1 - - -cdef int fill_moves(Move* moves, list tag_names) except -1: - cdef Move* m - label_names = {'-': 0} - for i, tag_name in enumerate(tag_names): - m = &moves[i] - if '-' in tag_name: - action_str, label = tag_name.split('-') - elif tag_name == 'O': - action_str = 'O' - label = '-' - elif tag_name == 'NULL' or tag_name == 'EOL': - action_str = '?' - label = '-' - else: - raise StandardError(tag_name) - m.action = ACTION_NAMES.index(action_str) - m.label = label_names.setdefault(label, len(label_names)) - m.clas = i diff --git a/spacy/ner/context.pxd b/spacy/ner/context.pxd deleted file mode 100644 index 433334765..000000000 --- a/spacy/ner/context.pxd +++ /dev/null @@ -1,151 +0,0 @@ -from thinc.typedefs cimport atom_t -from ..typedefs cimport hash_t -from ..tokens cimport Tokens -from ..lexeme cimport Lexeme -from .structs cimport State - - -cpdef enum: - T_sic - T_cluster - T_norm - T_shape - T_asciied - T_prefix - T_suffix - T_length - T_postype - T_nertype - T_sensetype - T_is_alpha - T_is_ascii - T_is_digit - T_is_lower - T_is_punct - T_is_space - T_is_title - T_is_upper - T_like_url - T_like_number - T_oft_lower - T_oft_title - T_oft_upper - T_in_males - T_in_females - T_in_surnames - T_in_places - T_in_celebs - T_in_names - T_pos - T_sense - T_ner - - -cpdef enum: - P2_sic - P2_cluster - P2_norm - P2_shape - P2_prefix - P2_suffix - P2_length - P2_postype - P2_is_alpha - P2_is_digit - P2_is_lower - P2_is_punct - P2_is_title - P2_is_upper - P2_like_number - P2_pos - - P1_sic - P1_cluster - P1_norm - P1_shape - P1_prefix - P1_suffix - P1_length - P1_postype - P1_is_alpha - P1_is_digit - P1_is_lower - P1_is_punct - P1_is_title - P1_is_upper - P1_like_number - P1_pos - - W_sic - W_cluster - W_norm - W_shape - W_prefix - W_suffix - W_length - W_postype - W_is_alpha - W_is_digit - W_is_lower - W_is_punct - W_is_space - W_is_title - W_is_upper - W_like_number - W_pos - - N1_sic - N1_cluster - N1_norm - N1_shape - N1_prefix - N1_suffix - N1_length - N1_postype - N1_is_alpha - N1_is_ascii - N1_is_digit - N1_is_lower - N1_is_punct - N1_is_space - N1_is_title - N1_is_upper - N1_like_number - N1_pos - - N2_sic - N2_cluster - N2_norm - N2_shape - N2_asciied - N2_prefix - N2_suffix - N2_length - N2_postype - N2_is_alpha - N2_is_digit - N2_is_lower - N2_is_punct - N2_is_space - N2_is_title - N2_is_upper - N2_like_number - N2_pos - N2_sense - - E0_sic - E0_cluster - E0_pos - - E1_sic - E1_cluster - E1_pos - - E_last_sic - E_last_cluster - E_last_pos - - N_FIELDS - - -cdef int fill_context(atom_t* context, State* s, Tokens tokens) except -1 diff --git a/spacy/ner/context.pyx b/spacy/ner/context.pyx deleted file mode 100644 index f6beb1501..000000000 --- a/spacy/ner/context.pyx +++ /dev/null @@ -1,76 +0,0 @@ -from libc.string cimport memset - -from murmurhash.mrmr cimport hash64 -from ._state cimport entity_is_open -from ..lexeme cimport * - - -cdef int _fill_token(atom_t* c, Lexeme* lex, atom_t pos): - c[T_sic] = lex.sic - c[T_cluster] = lex.cluster - c[T_norm] = lex.norm if (lex.prob != 0 and lex.prob >= -10) else lex.shape - c[T_shape] = lex.shape - c[T_asciied] = lex.asciied - c[T_prefix] = lex.prefix - c[T_suffix] = lex.suffix - c[T_length] = lex.length - - c[T_postype] = lex.postype - c[T_nertype] = 0 - c[T_sensetype] = 0 - - c[T_is_alpha] = lex.flags & (1 << IS_ALPHA) - c[T_is_digit] = lex.flags & (1 << IS_DIGIT) - c[T_is_lower] = lex.flags & (1 << IS_LOWER) - c[T_is_punct] = lex.flags & (1 << IS_PUNCT) - c[T_is_space] = lex.flags & (1 << IS_SPACE) - c[T_is_title] = lex.flags & (1 << IS_TITLE) - c[T_is_upper] = lex.flags & (1 << IS_UPPER) - c[T_like_url] = lex.flags & (1 << LIKE_URL) - c[T_like_number] = lex.flags & (1 << LIKE_NUMBER) - c[T_oft_lower] = lex.flags & (1 << OFT_LOWER) - c[T_oft_title] = lex.flags & (1 << OFT_TITLE) - c[T_oft_upper] = lex.flags & (1 << OFT_UPPER) - - c[T_in_males] = lex.flags & (1 << IN_MALES) - c[T_in_females] = lex.flags & (1 << IN_FEMALES) - c[T_in_surnames] = lex.flags & (1 << IN_SURNAMES) - c[T_in_places] = lex.flags & (1 << IN_PLACES) - c[T_in_celebs] = lex.flags & (1 << IN_CELEBS) - c[T_in_names] = lex.flags & (1 << IN_NAMES) - - c[T_pos] = pos - c[T_sense] = 0 - - -cdef int _fill_outer_token(atom_t* c, Lexeme* lex, atom_t pos): - c[0] = lex.sic - c[1] = lex.cluster - c[2] = lex.shape - c[3] = pos - - -cdef int fill_context(atom_t* context, State* s, Tokens tokens) except -1: - cdef int i - for i in range(N_FIELDS): - context[i] = 0 - i = s.i - _fill_token(&context[P2_sic], tokens.lex[i-2], tokens.pos[i-2]) - _fill_token(&context[P1_sic], tokens.lex[i-1], tokens.pos[i-1]) - _fill_token(&context[W_sic], tokens.lex[i], tokens.pos[i]) - _fill_token(&context[N1_sic], tokens.lex[i+1], tokens.pos[i+1]) - _fill_token(&context[N2_sic], tokens.lex[i+2], tokens.pos[i+2]) - - cdef atom_t[5] ent_vals - if entity_is_open(s): - context[E0_sic] = tokens.lex[s.curr.start].sic - context[E0_cluster] = tokens.lex[s.curr.start].cluster - context[E0_pos] = tokens.pos[s.curr.start] - context[E_last_sic] = tokens.lex[s.i-1].sic - context[E_last_cluster] = tokens.lex[s.i-1].cluster - context[E_last_pos] = tokens.pos[s.i-1] - if (s.curr.start + 1) < s.i: - context[E1_sic] = tokens.lex[s.curr.start+1].sic - context[E1_cluster] = tokens.lex[s.curr.start+1].cluster - context[E1_pos] = tokens.pos[s.curr.start+1] - return 1 diff --git a/spacy/ner/feats.pxd b/spacy/ner/feats.pxd deleted file mode 100644 index e69de29bb..000000000 diff --git a/spacy/ner/feats.pyx b/spacy/ner/feats.pyx deleted file mode 100644 index b1657716e..000000000 --- a/spacy/ner/feats.pyx +++ /dev/null @@ -1,99 +0,0 @@ -from .context import * - - -LOCAL = ( - (W_sic,), - (P1_sic,), - (N1_sic,), - (P2_sic,), - (N2_sic,), - - (P1_sic, W_sic,), - (W_sic, N1_sic), - - (W_prefix,), - (W_suffix,), - - (P1_shape,), - (W_shape,), - (N1_shape,), - (P1_shape, W_shape,), - (W_shape, P1_shape,), - (P1_shape, W_shape, N1_shape), - (N2_shape,), - (P2_shape,), - - (P2_norm, P1_norm, W_norm), - (P1_norm, W_norm, N1_norm), - (W_norm, N1_norm, N2_norm) -) - -POS = ( - (P2_pos,), - (P1_pos,), - (W_pos,), - (N1_pos,), - (N2_pos,), - - (P1_pos, W_pos), - (W_pos, N1_pos), - (P2_pos, P1_pos, W_pos), - (P1_pos, W_pos, N1_pos), - (W_pos, N1_pos, N2_pos) -) - -CLUSTERS = ( - (P2_cluster,), - (P1_cluster,), - (W_cluster,), - (N1_cluster,), - (N2_cluster,), - - (P1_cluster, W_cluster), - (W_cluster, N1_cluster), -) - - -CLUSTER_POS = ( - (P1_cluster, W_pos), - (W_pos, P1_cluster), - (W_cluster, N1_pos), - (W_pos, N1_cluster) -) - - -STATE = ( - (E0_sic,), - (E0_cluster,), - (E0_pos,), - (E_last_sic,), - (E_last_cluster,), - (E_last_pos,), - - (E0_sic, W_sic), - (E0_cluster, W_cluster), - (E0_pos, W_pos), - (E_last_sic, W_sic), - (E_last_pos, W_pos), - - (E0_pos, E_last_pos, W_pos), - (E0_cluster, E_last_cluster, W_cluster), - - (E0_sic, E_last_sic), - (E0_pos, E_last_pos), - (E0_cluster, E_last_cluster), - (E0_pos, E_last_cluster), - (E0_cluster, E_last_pos), - - (E1_sic,), - (E1_cluster,), - (E1_pos,), - - (E0_sic, E1_sic), - (E0_sic, E1_pos,), - (E0_pos, E1_sic,), - (E0_pos, E1_pos), -) - - -TEMPLATES = LOCAL + CLUSTERS + POS + CLUSTER_POS + STATE diff --git a/spacy/ner/greedy_parser.pxd b/spacy/ner/greedy_parser.pxd deleted file mode 100644 index 9ee4d668d..000000000 --- a/spacy/ner/greedy_parser.pxd +++ /dev/null @@ -1,29 +0,0 @@ -from cymem.cymem cimport Pool -from thinc.features cimport Extractor -from thinc.learner cimport LinearModel -from thinc.typedefs cimport * - -from ..tokens cimport Tokens -from ..typedefs cimport * - -from .structs cimport Move -from .annot cimport NERAnnotation - - -cdef class NERParser: - cdef Pool mem - cdef Extractor extractor - cdef LinearModel model - cdef readonly list tag_names - cdef readonly list entity_types - cdef readonly int n_classes - - cdef Move* _moves - cdef atom_t* _context - cdef feat_t* _feats - cdef weight_t* _values - cdef weight_t* _scores - - - cpdef list train(self, Tokens tokens, NERAnnotation annot) - cpdef list set_tags(self, Tokens tokens) diff --git a/spacy/ner/greedy_parser.pyx b/spacy/ner/greedy_parser.pyx deleted file mode 100644 index 94d096529..000000000 --- a/spacy/ner/greedy_parser.pyx +++ /dev/null @@ -1,139 +0,0 @@ -from __future__ import division -from __future__ import unicode_literals - -cimport cython -import random -import os -from os import path -import shutil -import json - -from thinc.features cimport ConjFeat - -from .context cimport fill_context -from .context cimport N_FIELDS -from .structs cimport Move, State -from .io_moves cimport fill_moves, transition, best_accepted -from .io_moves cimport set_accept_if_valid, set_accept_if_oracle -from .io_moves import get_n_moves -from ._state cimport init_state -from ._state cimport entity_is_open -from ._state cimport end_entity -from .annot cimport NERAnnotation - - -def setup_model_dir(entity_types, templates, model_dir): - if path.exists(model_dir): - shutil.rmtree(model_dir) - os.mkdir(model_dir) - config = { - 'templates': templates, - 'entity_types': entity_types, - } - with open(path.join(model_dir, 'config.json'), 'w') as file_: - json.dump(config, file_) - - -def train(train_sents, model_dir, nr_iter=10): - cdef Tokens tokens - cdef NERAnnotation gold_ner - parser = NERParser(model_dir) - for _ in range(nr_iter): - tp = 0 - fp = 0 - fn = 0 - for i, (tokens, gold_ner) in enumerate(train_sents): - #print [tokens[i].string for i in range(tokens.length)] - test_ents = set(parser.train(tokens, gold_ner)) - #print 'Test', test_ents - gold_ents = set(gold_ner.entities) - #print 'Gold', set(gold_ner.entities) - tp += len(gold_ents.intersection(test_ents)) - fp += len(test_ents - gold_ents) - fn += len(gold_ents - test_ents) - p = tp / (tp + fp) - r = tp / (tp + fn) - f = 2 * ((p * r) / (p + r)) - print 'P: %.3f' % p, - print 'R: %.3f' % r, - print 'F: %.3f' % f - random.shuffle(train_sents) - parser.model.end_training() - parser.model.dump(path.join(model_dir, 'model')) - - -cdef class NERParser: - def __init__(self, model_dir): - self.mem = Pool() - cfg = json.load(open(path.join(model_dir, 'config.json'))) - templates = cfg['templates'] - self.extractor = Extractor(templates, [ConjFeat] * len(templates)) - self.entity_types = cfg['entity_types'] - self.n_classes = get_n_moves(len(self.entity_types)) - self._moves = self.mem.alloc(self.n_classes, sizeof(Move)) - fill_moves(self._moves, self.n_classes, self.entity_types) - self.model = LinearModel(self.n_classes) - if path.exists(path.join(model_dir, 'model')): - self.model.load(path.join(model_dir, 'model')) - - self._context = self.mem.alloc(N_FIELDS, sizeof(atom_t)) - self._feats = self.mem.alloc(self.extractor.n+1, sizeof(feat_t)) - self._values = self.mem.alloc(self.extractor.n+1, sizeof(weight_t)) - self._scores = self.mem.alloc(self.model.nr_class, sizeof(weight_t)) - - cpdef list train(self, Tokens tokens, NERAnnotation annot): - cdef Pool mem = Pool() - cdef State* s = init_state(mem, tokens.length) - cdef Move* guess - cdef Move* oracle_move - n_correct = 0 - cdef int f = 0 - while s.i < tokens.length: - fill_context(self._context, s, tokens) - self.extractor.extract(self._feats, self._values, self._context, NULL) - self.model.score(self._scores, self._feats, self._values) - - set_accept_if_valid(self._moves, self.n_classes, s) - guess = best_accepted(self._moves, self._scores, self.n_classes) - assert guess.clas != 0 - set_accept_if_oracle(self._moves, self.n_classes, s, - annot.starts, annot.ends, annot.labels) - oracle_move = best_accepted(self._moves, self._scores, self.n_classes) - assert oracle_move.clas != 0 - if guess.clas == oracle_move.clas: - counts = {} - n_correct += 1 - else: - counts = {guess.clas: {}, oracle_move.clas: {}} - self.extractor.count(counts[oracle_move.clas], self._feats, 1) - self.extractor.count(counts[guess.clas], self._feats, -1) - self.model.update(counts) - transition(s, guess) - tokens.ner[s.i-1] = s.tags[s.i-1] - if entity_is_open(s): - s.curr.label = annot.labels[s.curr.start] - end_entity(s) - entities = [] - for i in range(s.j): - entities.append((s.ents[i].start, s.ents[i].end, s.ents[i].label)) - return entities - - cpdef list set_tags(self, Tokens tokens): - cdef Pool mem = Pool() - cdef State* s = init_state(mem, tokens.length) - cdef Move* move - while s.i < tokens.length: - fill_context(self._context, s, tokens) - self.extractor.extract(self._feats, self._values, self._context, NULL) - self.model.score(self._scores, self._feats, self._values) - set_accept_if_valid(self._moves, self.n_classes, s) - move = best_accepted(self._moves, self._scores, self.n_classes) - transition(s, move) - tokens.ner[s.i-1] = s.tags[s.i-1] - if entity_is_open(s): - s.curr.label = move.label - end_entity(s) - entities = [] - for i in range(s.j): - entities.append((s.ents[i].start, s.ents[i].end, s.ents[i].label)) - return entities diff --git a/spacy/ner/io_moves.pxd b/spacy/ner/io_moves.pxd deleted file mode 100644 index 50f6be106..000000000 --- a/spacy/ner/io_moves.pxd +++ /dev/null @@ -1,26 +0,0 @@ -from cymem.cymem cimport Pool - -from thinc.typedefs cimport class_t -from thinc.typedefs cimport weight_t - -from .structs cimport State, Move - - -cpdef enum ActionType: - MISSING - SHIFT - REDUCE - OUT - N_ACTIONS - - -cdef int set_accept_if_oracle(Move* moves, int n, State* s, - int* g_starts, int* g_ends, int* g_labels) except 0 - -cdef int set_accept_if_valid(Move* moves, int n, State* s) except 0 - -cdef Move* best_accepted(Move* moves, weight_t* scores, int n) except NULL - -cdef int transition(State *s, Move* m) except -1 - -cdef int fill_moves(Move* moves, int n, list entity_types) except -1 diff --git a/spacy/ner/io_moves.pyx b/spacy/ner/io_moves.pyx deleted file mode 100644 index 257a18f3c..000000000 --- a/spacy/ner/io_moves.pyx +++ /dev/null @@ -1,152 +0,0 @@ -from __future__ import unicode_literals -from cymem.cymem cimport Pool - -from thinc.typedefs cimport class_t -from thinc.typedefs cimport weight_t - -from ._state cimport begin_entity -from ._state cimport end_entity -from ._state cimport entity_is_open - - -ACTION_NAMES = ['' for _ in range(N_ACTIONS)] -ACTION_NAMES[MISSING] = '?' -ACTION_NAMES[SHIFT] = 'S' -ACTION_NAMES[REDUCE] = 'R' -ACTION_NAMES[OUT] = 'O' - - -cdef int set_accept_if_oracle(Move* moves, int n, State* s, - int* g_starts, int* g_ends, int* g_labels) except 0: - # If curr entity: (O invalid) - # if cost is not sunk (start matches, end is i-1 or greater - # - If i-1 == gold.end --> R=True, S=False - # - Shift if end >= i --> S=True, R=False - # else - # - If i == gold.start --> R=True, S=False - # - Else --> R=True, S=True - # Else (R invalid): - # if start == gold.start: S=True, O=False - # else: O=True, S=False - if entity_is_open(s): - g_start = g_starts[s.curr.start] - g_end = g_ends[s.curr.start] - accept_o = False - if g_start == s.curr.start and g_end == s.i: - accept_r = True - accept_s = False - elif g_start == s.curr.start and g_end > s.i: - accept_s = True - s_label = s.curr.label - accept_r = False - elif g_starts[s.i] == s.i: - accept_r = True - accept_s = False - else: - accept_r = True - accept_s = True - s_label = s.curr.label - else: - accept_r = False - if g_starts[s.i] == s.i: - accept_s = True - s_label = g_labels[s.i] - accept_o = False - else: - accept_o = True - accept_s = False - n_accept = 0 - moves[0].accept = False - for i in range(1, n): - m = &moves[i] - if m.action == SHIFT: - m.accept = accept_s and m.label == s_label - elif m.action == REDUCE: - m.accept = accept_r - elif m.action == OUT: - m.accept = accept_o - n_accept += m.accept - assert n_accept != 0 - return n_accept - - -cdef int set_accept_if_valid(Move* moves, int n, State* s) except 0: - cdef int i - cdef bint open_ent = entity_is_open(s) - cdef int n_accept = 0 - moves[0].accept = False - for i in range(1, n): - if moves[i].action == SHIFT: - moves[i].accept = moves[i].label == s.curr.label or not entity_is_open(s) - elif moves[i].action == REDUCE: - moves[i].accept = open_ent - elif moves[i].action == OUT: - moves[i].accept = not open_ent - n_accept += moves[i].accept - return n_accept - - -cdef Move* best_accepted(Move* moves, weight_t* scores, int n) except NULL: - cdef int first_accept = -1 - for first_accept in range(1, n): - if moves[first_accept].accept: - break - else: - raise StandardError - assert first_accept != -1 - cdef int best = first_accept - cdef weight_t score = scores[first_accept-1] - cdef int i - for i in range(first_accept+1, n): - if moves[i].accept and scores[i-1] > score: - best = i - score = scores[i-1] - return &moves[best] - - -cdef int transition(State *s, Move* move) except -1: - s.tags[s.i] = move.clas - if move.action == OUT: - s.i += 1 - elif move.action == SHIFT: - if not entity_is_open(s): - s.curr.start = s.i - s.curr.label = move.label - s.i += 1 - elif move.action == REDUCE: - s.curr.end = s.i - s.ents[s.j] = s.curr - s.j += 1 - s.curr.start = 0 - s.curr.label = -1 - s.curr.end = 0 - else: - raise ValueError(move.action) - - -def get_n_moves(n_tags): - return 1 + 1 + 1 + n_tags - - -cdef int fill_moves(Move* moves, int n, list entity_types) except -1: - cdef Move* m - label_names = {'-': 0} - # Reserve class 0 - cdef int i = 0 - moves[i].clas = i - moves[i].action = MISSING - moves[i].label = 0 - i += 1 - for entity_type in entity_types: - moves[i].action = SHIFT - moves[i].label = label_names.setdefault(entity_type, len(label_names)) - moves[i].clas = i - i += 1 - moves[i].clas = i - moves[i].action = OUT - moves[i].label = 0 - i += 1 - moves[i].action = REDUCE - moves[i].clas = i - moves[i].label = 0 - i += 1 diff --git a/spacy/ner/pystate.pxd b/spacy/ner/pystate.pxd deleted file mode 100644 index 6710d9f40..000000000 --- a/spacy/ner/pystate.pxd +++ /dev/null @@ -1,16 +0,0 @@ -from cymem.cymem cimport Pool - -from .structs cimport Move, State - - -cdef class PyState: - cdef Pool mem - cdef readonly list tag_names - cdef readonly int n_classes - cdef readonly dict moves_by_name - - cdef Move* _moves - cdef Move* _golds - cdef State* _s - - cdef Move* _get_move(self, unicode move_name) except NULL diff --git a/spacy/ner/pystate.pyx b/spacy/ner/pystate.pyx deleted file mode 100644 index ba18c2f07..000000000 --- a/spacy/ner/pystate.pyx +++ /dev/null @@ -1,60 +0,0 @@ -from __future__ import unicode_literals - -from ._state cimport init_state -from ._state cimport entity_is_open -from .bilou_moves cimport fill_moves -from .bilou_moves cimport transition -from .bilou_moves cimport set_accept_if_valid, set_accept_if_oracle -from .bilou_moves import get_n_moves -from .bilou_moves import ACTION_NAMES - - -cdef class PyState: - def __init__(self, tag_names, n_tokens): - self.mem = Pool() - self.tag_names = tag_names - self.n_classes = len(tag_names) - assert self.n_classes != 0 - self._moves = self.mem.alloc(self.n_classes, sizeof(Move)) - fill_moves(self._moves, tag_names) - self._s = init_state(self.mem, n_tokens) - self._golds = self.mem.alloc(n_tokens, sizeof(Move)) - - cdef Move* _get_move(self, unicode move_name) except NULL: - return &self._moves[self.tag_names.index(move_name)] - - def set_golds(self, list gold_names): - cdef Move* m - for i, name in enumerate(gold_names): - m = self._get_move(name) - self._golds[i] = m[0] - - def transition(self, unicode move_name): - cdef Move* m = self._get_move(move_name) - transition(self._s, m) - - def is_valid(self, unicode move_name): - cdef Move* m = self._get_move(move_name) - set_accept_if_valid(self._moves, self.n_classes, self._s) - return m.accept - - def is_gold(self, unicode move_name): - cdef Move* m = self._get_move(move_name) - set_accept_if_oracle(self._moves, self._golds, self.n_classes, self._s) - return m.accept - - property ent: - def __get__(self): - return self._s.curr - - property n_ents: - def __get__(self): - return self._s.j - - property i: - def __get__(self): - return self._s.i - - property open_entity: - def __get__(self): - return entity_is_open(self._s) diff --git a/spacy/ner/structs.pxd b/spacy/ner/structs.pxd deleted file mode 100644 index 7d6ebed19..000000000 --- a/spacy/ner/structs.pxd +++ /dev/null @@ -1,23 +0,0 @@ -from thinc.typedefs cimport class_t - - -cdef struct Entity: - int start - int end - int label - - -cdef struct State: - Entity curr - Entity* ents - int* tags - int i - int j - int length - - -cdef struct Move: - class_t clas - int action - int label - bint accept