diff --git a/spacy/ner/__init__.pxd b/spacy/ner/__init__.pxd
new file mode 100644
index 000000000..e69de29bb
diff --git a/spacy/ner/__init__.py b/spacy/ner/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/spacy/ner/_feats.pxd b/spacy/ner/_feats.pxd
new file mode 100644
index 000000000..e69de29bb
diff --git a/spacy/ner/_feats.pyx b/spacy/ner/_feats.pyx
new file mode 100644
index 000000000..18e073c5b
--- /dev/null
+++ b/spacy/ner/_feats.pyx
@@ -0,0 +1,175 @@
+# Feature-template tables for spacy.ner.  P4..P1, N0 and N1..N4 alias the
+# FIELD_IDS slots of spacy.context (presumably the tokens before, at and after
+# the current position -- confirm there).  TEMPLATES at the bottom is the
+# concatenation of every group defined below.
+from spacy.context cimport FIELD_IDS, Token
+
+
+cdef Token P4 = FIELD_IDS.P4
+cdef Token P3 = FIELD_IDS.P3
+cdef Token P2 = FIELD_IDS.P2
+cdef Token P1 = FIELD_IDS.P1
+cdef Token N0 = FIELD_IDS.N0
+cdef Token N1 = FIELD_IDS.N1
+cdef Token N2 = FIELD_IDS.N2
+cdef Token N3 = FIELD_IDS.N3
+cdef Token N4 = FIELD_IDS.N4
+
+# NOTE: the bare string literal below is a disabled template set.  It is an
+# expression statement with no effect; the live TEMPLATES is built at the end.
+"""
+TEMPLATES = (
+    (N0.sic,),
+    (N0.cluster,),
+
+    (P1.pos,),
+    (P1.sic,),
+
+    (N1.norm,),
+    (N1.pos,),
+
+    (P1.ner,),
+    (P2.ner,),
+
+    (N0.cluster,),
+    (P1.cluster,),
+    (N1.cluster,),
+
+    (N0.is_alpha,),
+    (N0.is_digit,),
+    (N0.is_title,),
+    (N0.is_upper,),
+
+    (N0.is_title, N0.oft_title),
+    (N0.is_upper, N0.oft_upper),
+
+    (P1.cluster, N0.norm),
+    (N0.norm, N1.cluster),
+
+    (P1.ner, N0.pos),
+    (P2.ner, P1.ner, N0.pos),
+
+    (P2.pos, P1.pos, N0.sic),
+    (N0.sic, N1.pos, N2.pos)
+)
+"""
+
+LOCAL = (
+    (N0.sic,),
+    (P1.sic,),
+    (N1.sic,),
+    (P2.sic,),
+    (N2.sic,),
+    (P3.sic,),
+    (N3.sic,),
+    (P4.sic,),
+    (N4.sic,),
+
+    (P1.sic, N0.sic,),
+    (N0.sic, N1.sic),
+
+    (N0.prefix,),
+    (N0.suffix,),
+
+    (P1.shape,),
+    (N0.shape,),
+    (N1.shape,),
+    (P1.shape, N0.shape,),
+    (N0.shape, P1.shape,),
+    (P1.shape, N0.shape, N1.shape),
+    (N2.shape,),
+    (P2.shape,),
+    (P3.shape,),
+    (N3.shape,),
+    (P4.shape,),
+    (N4.shape,),
+
+    (P2.norm, P1.norm, N0.norm),
+    (P1.norm, N0.norm, N1.norm),
+    (N0.norm, N1.norm, N2.norm)
+)
+
+BOOLS = (
+    (N0.is_title,),
+)
+
+
+HISTORY = (
+    (P1.ner,),
+    (P1.ner, N0.sic,),
+    (P2.ner,),
+    (P2.ner, P1.ner),
+    (P2.ner, P1.ner, N0.sic),
+    (P2.pos, P1.ner, N0.pos),
+    (P2.ner, P1.pos, N0.pos),
+    (P3.ner,),
+    (P4.ner,),
+)
+
+POS = (
+    (P4.pos,),
+    (P3.pos,),
+    (P2.pos,),
+    (P1.pos,),
+    (N0.pos,),
+    (N1.pos,),
+    (N2.pos,),
+    (N3.pos,),
+    (N4.pos,),
+
+    (P1.pos, N0.pos),
+    (N0.pos, N1.pos),
+    (P2.pos, P1.pos, N0.pos),
+    (P1.pos, N0.pos, N1.pos),
+    (N0.pos, N1.pos, N2.pos)
+)
+
+CLUSTERS = (
+    (P4.cluster,),
+    (P3.cluster,),
+    (P2.cluster,),
+    (P1.cluster,),
+    (N0.cluster,),
+    (N1.cluster,),
+    (N2.cluster,),
+    (N3.cluster,),
+    (N4.cluster,),
+
+    (P1.cluster, N0.cluster),
+    (N0.cluster, N1.cluster),
+)
+
+
+CLUSTER_POS = (
+    (P1.cluster, N0.pos),
+    (N0.pos, P1.cluster),
+    (N0.cluster, N1.pos),
+    (N0.pos, N1.cluster)
+)
+
+
+GAZ = (
+    (N0.in_males,),
+    (N0.in_females,),
+    (N0.in_surnames,),
+    (N0.in_places,),
+    (N0.in_games,),
+    (N0.in_celebs,),
+    (N0.in_names,),
+    (P1.in_males,),
+    (P1.in_females,),
+    (P1.in_surnames,),
+    (P1.in_places,),
+    (P1.in_games,),
+    (P1.in_celebs,),
+    (P1.in_names,),
+    (N1.in_males,),
+    (N1.in_females,),
+    (N1.in_surnames,),
+    (N1.in_places,),
+    (N1.in_games,),
+    (N1.in_celebs,),
+    (N1.in_names,),
+)
+
+TEMPLATES = LOCAL + HISTORY + CLUSTERS + POS + CLUSTER_POS + GAZ + BOOLS
diff --git a/spacy/ner/_state.pxd b/spacy/ner/_state.pxd
new file mode 100644
index 000000000..c522e748b
--- /dev/null
+++ b/spacy/ner/_state.pxd
@@ -0,0 +1,29 @@
+from cymem.cymem cimport Pool
+from .moves cimport Move
+
+
+# One entity span recorded by the transition system.
+cdef struct Entity:
+    int start   # token index at which begin_entity() opened the span
+    int end     # set by end_entity() to (index of the closing token + 1)
+    int label   # entity label id; -1 while the slot is unused
+
+
+# Parser state for one sentence.
+cdef struct State:
+    Entity* ents   # entity slots, one allocated per token of the sentence
+    int* tags      # class id of the move applied at each token
+    int i          # index of the token the next move applies to
+    int j          # index of the most recent entity in ents; -1 if none
+    int length     # number of tokens in the sentence
+
+
+cdef int begin_entity(State* s, int label) except -1
+
+cdef int end_entity(State* s) except -1
+
+cdef State* init_state(Pool mem, int sent_length) except NULL
+
+cdef bint entity_is_open(State *s) except -1
+
+cdef bint entity_is_sunk(State *s, Move* golds) except -1
diff --git a/spacy/ner/_state.pyx b/spacy/ner/_state.pyx
new file mode 100644
index 000000000..dce8e4d45
--- /dev/null
+++ b/spacy/ner/_state.pyx
@@ -0,0 +1,55 @@
+from .moves cimport BEGIN, UNIT
+
+
+cdef int begin_entity(State* s, int label) except -1:
+    # Open a new entity at the current token: move to the next free slot and
+    # record where the span starts and which label it carries.
+    s.j += 1
+    s.ents[s.j].start = s.i
+    s.ents[s.j].label = label
+    # end == 0 marks the span as not yet closed; see entity_is_open().
+    s.ents[s.j].end = 0
+
+
+cdef int end_entity(State* s) except -1:
+    # Close the most recent entity.  `end` is exclusive, hence always >= 1.
+    s.ents[s.j].end = s.i + 1
+
+
+cdef State* init_state(Pool mem, int sent_length) except NULL:
+    # Allocate, from `mem`, the state for a sentence of `sent_length` tokens:
+    # no entity yet (j == -1), every entity slot unlabelled (-1), and the
+    # cursor on the first token.
+    cdef int i
+    cdef State* s = <State*>mem.alloc(1, sizeof(State))
+    s.i = 0
+    s.j = -1
+    s.ents = <Entity*>mem.alloc(sent_length, sizeof(Entity))
+    for i in range(sent_length):
+        s.ents[i].label = -1
+    s.tags = <int*>mem.alloc(sent_length, sizeof(int))
+    s.length = sent_length
+    return s
+
+
+cdef bint entity_is_open(State *s) except -1:
+    # An entity is open once begin_entity() has run for it and end_entity()
+    # has not.  The label alone cannot tell: it is never reset, so checking
+    # only `label != -1` reports every finished entity as still open.
+    if s.j < 0:
+        return False
+    return s.ents[s.j].label != -1 and s.ents[s.j].end == 0
+
+
+cdef bint entity_is_sunk(State *s, Move* golds) except -1:
+    # True when the open entity can no longer match the gold annotation: the
+    # gold move at its start token does not begin an entity (BEGIN or UNIT),
+    # or begins one with a different label.  False when no entity is open.
+    if not entity_is_open(s):
+        return False
+
+    cdef Entity* ent = &s.ents[s.j]
+    cdef Move* gold = &golds[ent.start]
+    if gold.action != BEGIN and gold.action != UNIT:
+        return True
+    return gold.label != ent.label
diff --git a/spacy/ner/greedy_parser.pxd b/spacy/ner/greedy_parser.pxd
new file mode 100644
index 000000000..e019dd589
--- /dev/null
+++ b/spacy/ner/greedy_parser.pxd
@@ -0,0 +1,28 @@
+from cymem.cymem cimport Pool
+from thinc.features cimport Extractor
+from thinc.learner cimport LinearModel
+from thinc.typedefs cimport *
+
+from ..tokens cimport Tokens
+from ..typedefs cimport *
+
+from .moves cimport Move
+
+
+cdef class NERParser:
+    cdef Pool mem
+    cdef Extractor extractor
+    cdef LinearModel model
+    # Assigned in __init__; a cdef class can only store declared attributes.
+    cdef readonly list entity_types
+    cdef readonly int n_classes
+
+    # Buffers allocated from `mem` in __init__.
+    cdef Move* _moves
+    cdef atom_t* _context
+    cdef feat_t* _feats
+    cdef weight_t* _values
+    cdef weight_t* _scores
+
+    cpdef int train(self, Tokens tokens, gold_classes) except -1
+    cpdef int set_tags(self, Tokens tokens) except -1
diff --git a/spacy/ner/greedy_parser.pyx b/spacy/ner/greedy_parser.pyx
new file mode 100644
index 000000000..2e3af5717
--- /dev/null
+++ b/spacy/ner/greedy_parser.pyx
@@ -0,0 +1,102 @@
+cimport cython
+import random
+import os
+from os import path
+import shutil
+import json
+
+from thinc.features cimport ConjFeat
+
+from ..context cimport fill_context
+from ..context cimport N_FIELDS
+from .moves cimport Move
+from .moves cimport fill_moves, transition, best_accepted
+from .moves cimport set_accept_if_valid, set_accept_if_oracle
+from .moves import get_n_moves
+from ._state cimport State
+from ._state cimport init_state
+
+
+cdef class NERParser:
+    """Greedy transition-based entity recogniser scored by a linear model.
+
+    `model_dir` must contain a config.json with 'templates' and
+    'entity_types' entries; a 'model' file is loaded from it when present.
+    """
+    def __init__(self, model_dir):
+        self.mem = Pool()
+        # Read the config inside `with` so the file handle is closed.
+        with open(path.join(model_dir, 'config.json')) as file_:
+            cfg = json.load(file_)
+        templates = cfg['templates']
+        self.entity_types = cfg['entity_types']
+        self.extractor = Extractor(templates, [ConjFeat] * len(templates))
+        self.n_classes = get_n_moves(len(self.entity_types))
+        self._moves = <Move*>self.mem.alloc(self.n_classes, sizeof(Move))
+        fill_moves(self._moves, len(self.entity_types))
+        # One model class per move (this class has no `tag_names` attribute).
+        self.model = LinearModel(self.n_classes)
+        if path.exists(path.join(model_dir, 'model')):
+            self.model.load(path.join(model_dir, 'model'))
+
+        self._context = <atom_t*>self.mem.alloc(N_FIELDS, sizeof(atom_t))
+        self._feats = <feat_t*>self.mem.alloc(self.extractor.n+1, sizeof(feat_t))
+        self._values = <weight_t*>self.mem.alloc(self.extractor.n+1, sizeof(weight_t))
+        self._scores = <weight_t*>self.mem.alloc(self.model.nr_class, sizeof(weight_t))
+
+    cpdef int train(self, Tokens tokens, gold_classes) except -1:
+        """Run the parser over `tokens`, updating the model after each move.
+
+        `gold_classes` holds one 1-based move id per token.  The predicted
+        class ids are written to `tokens.ner`; the return value is always 0.
+        Raises ValueError if a gold id is outside 1..n_classes.
+        """
+        cdef Pool mem = Pool()
+        cdef State* s = init_state(mem, tokens.length)
+        cdef Move* golds = <Move*>mem.alloc(len(gold_classes), sizeof(Move))
+        cdef Move* guess
+        cdef Move* gold
+        for i, clas in enumerate(gold_classes):
+            # Validate before indexing the C array: an out-of-range id would
+            # otherwise read outside self._moves.
+            if not 1 <= clas <= self.n_classes:
+                raise ValueError('Invalid gold class: %r' % (clas,))
+            golds[i] = self._moves[clas - 1]
+        while s.i < tokens.length:
+            fill_context(self._context, s.i, tokens)
+            self.extractor.extract(self._feats, self._values, self._context, NULL)
+            self.model.score(self._scores, self._feats, self._values)
+
+            set_accept_if_valid(self._moves, self.n_classes, s)
+            guess = best_accepted(self._moves, self._scores, self.n_classes)
+
+            set_accept_if_oracle(self._moves, golds, self.n_classes, s) # TODO
+            gold = best_accepted(self._moves, self._scores, self.n_classes)
+
+            if guess.clas == gold.clas:
+                self.model.update({})
+            else:
+                counts = {guess.clas: {}, gold.clas: {}}
+                self.extractor.count(counts[gold.clas], self._feats, 1)
+                self.extractor.count(counts[guess.clas], self._feats, -1)
+                self.model.update(counts)
+
+            # Always advance: returning on a correct guess would end training
+            # for this sentence at the first token predicted correctly.
+            transition(s, guess)
+            tokens.ner[s.i-1] = s.tags[s.i-1]
+        return 0
+
+    cpdef int set_tags(self, Tokens tokens) except -1:
+        """Predict a move for every token and store its class in `tokens.ner`."""
+        cdef Pool mem = Pool()
+        cdef State* s = init_state(mem, tokens.length)
+        cdef Move* move
+        while s.i < tokens.length:
+            fill_context(self._context, s.i, tokens)
+            self.extractor.extract(self._feats, self._values, self._context, NULL)
+            self.model.score(self._scores, self._feats, self._values)
+            set_accept_if_valid(self._moves, self.n_classes, s)
+            move = best_accepted(self._moves, self._scores, self.n_classes)
+            transition(s, move)
+            tokens.ner[s.i-1] = s.tags[s.i-1]
diff --git a/spacy/ner/moves.pxd b/spacy/ner/moves.pxd
new file mode 100644
index 000000000..e3063668d
--- /dev/null
+++ b/spacy/ner/moves.pxd
@@ -0,0 +1,34 @@
+from cymem.cymem cimport Pool
+
+from thinc.typedefs cimport class_t
+from thinc.typedefs cimport weight_t
+
+from ._state cimport State
+
+# Transition actions; N_ACTIONS is the number of actions, not an action.
+cpdef enum ActionType:
+    BEGIN
+    IN
+    LAST
+    UNIT
+    OUT
+    N_ACTIONS
+
+
+cdef struct Move:
+    class_t clas   # class id written to State.tags by transition()
+    int action     # an ActionType value
+    int label      # entity label the action applies to
+    bint accept    # flag set by the set_accept_if_* functions
+
+
+# NOTE(review): `except 0` treats a return of 0 (no move accepted) as an error.
+cdef int set_accept_if_oracle(Move* moves, Move* golds, int n, State* s) except 0
+
+cdef int set_accept_if_valid(Move* moves, int n, State* s) except 0
+
+cdef Move* best_accepted(Move* moves, weight_t* scores, int n) except NULL
+
+cdef int transition(State *s, Move* m) except -1
+
+cdef int fill_moves(Move* moves, int n_tags) except -1
diff --git a/spacy/ner/moves.pyx b/spacy/ner/moves.pyx
new file mode 100644
index 000000000..595fc19e6
--- /dev/null
+++ b/spacy/ner/moves.pyx
@@ -0,0 +1,193 @@
+from ._state cimport begin_entity
+from ._state cimport end_entity
+from ._state cimport entity_is_open
+from ._state cimport entity_is_sunk
+
+ACTION_NAMES = ['' for _ in range(N_ACTIONS)]
+ACTION_NAMES[BEGIN] = 'B'
+ACTION_NAMES[IN] = 'I'
+ACTION_NAMES[LAST] = 'L'
+ACTION_NAMES[UNIT] = 'U'
+ACTION_NAMES[OUT] = 'O'
+
+
+# The can_* predicates report whether an action is structurally valid in
+# state `s`: BEGIN, UNIT and OUT require that no entity is open, while IN
+# and LAST require an open entity carrying the same label.
+cdef bint can_begin(State* s, int label):
+    return not entity_is_open(s)
+
+
+cdef bint can_in(State* s, int label):
+    # Entity stores its label in `label`; the struct has no `tag` member.
+    return entity_is_open(s) and s.ents[s.j].label == label
+
+
+cdef bint can_last(State* s, int label):
+    return entity_is_open(s) and s.ents[s.j].label == label
+
+
+cdef bint can_unit(State* s, int label):
+    return not entity_is_open(s)
+
+
+cdef bint can_out(State* s, int label):
+    return not entity_is_open(s)
+
+
+cdef bint is_oracle(ActionType act, int tag, ActionType g_act, int g_tag,
+                    ActionType next_act, bint is_sunk):
+    # Whether taking action `act` with label `tag` is acceptable when the gold
+    # move for this token is (g_act, g_tag).  `next_act` is the gold action of
+    # the following token and `is_sunk` is the entity_is_sunk() result for the
+    # current state.  (P) / (R) in the notes below are presumably precision
+    # and recall.
+    if act == BEGIN:
+        # B, Gold B --> Label match
+        # B, Gold I / L / O / U --> False (P)
+        return g_act == BEGIN and tag == g_tag
+    elif act == IN:
+        if g_act == BEGIN:
+            # I, Gold B --> True (P of bad open entity sunk, R of this entity sunk)
+            return True
+        elif g_act == IN:
+            # I, Gold I --> True (label forced by prev, if mismatch, P and R both sunk)
+            return True
+        elif g_act == LAST:
+            # I, Gold L --> True iff this entity sunk and next tag == O
+            return is_sunk and next_act == OUT
+        elif g_act == OUT:
+            # I, Gold O --> True iff next tag == O
+            return next_act == OUT
+        elif g_act == UNIT:
+            # I, Gold U --> True iff next tag == O
+            return next_act == OUT
+    elif act == LAST:
+        if g_act == BEGIN:
+            # L, Gold B --> True
+            return True
+        elif g_act == IN:
+            # L, Gold I --> True iff this entity sunk
+            return is_sunk
+        elif g_act == LAST:
+            # L, Gold L --> True
+            return True
+        elif g_act == OUT:
+            # L, Gold O --> True
+            return True
+        elif g_act == UNIT:
+            # L, Gold U --> True
+            return True
+    elif act == OUT:
+        # O, Gold I / L / O --> True
+        # O, Gold B / U --> False
+        return g_act == IN or g_act == LAST or g_act == OUT
+    elif act == UNIT:
+        # U, Gold U --> True iff tag match
+        # U, Gold B / I / L / O --> False
+        return g_act == UNIT and tag == g_tag
+    # Any combination not listed above is rejected.
+    return False
+
+
+cdef int set_accept_if_valid(Move* moves, int n_classes, State* s) except 0:
+    # Set `accept` on each of the first n_classes moves according to the
+    # can_* predicate for its action, and return how many were accepted.
+    cdef int n_accept = 0
+    cdef Move* m
+    for i in range(n_classes):
+        m = &moves[i]
+        if m.action == BEGIN:
+            m.accept = can_begin(s, m.label)
+        elif m.action == IN:
+            m.accept = can_in(s, m.label)
+        elif m.action == LAST:
+            m.accept = can_last(s, m.label)
+        elif m.action == UNIT:
+            m.accept = can_unit(s, m.label)
+        elif m.action == OUT:
+            m.accept = can_out(s, m.label)
+        n_accept += m.accept
+    return n_accept
+
+
+cdef int set_accept_if_oracle(Move* moves, Move* golds, int n_classes, State* s) except 0:
+    # Set `accept` on each move according to is_oracle() for the gold move at
+    # the current token, and return how many were accepted.
+    cdef Move* g = &golds[s.i]
+    # Only look ahead when a following token exists: testing `s.i < s.length`
+    # would read golds[s.length] on the last token, one past the gold moves.
+    cdef ActionType next_act = golds[s.i+1].action if s.i + 1 < s.length else OUT
+    cdef bint is_sunk = entity_is_sunk(s, golds)
+    cdef Move* m
+    cdef int n_accept = 0
+    for i in range(n_classes):
+        m = &moves[i]
+        m.accept = is_oracle(m.action, m.label, g.action,
+                             g.label, next_act, is_sunk)
+        n_accept += m.accept
+    return n_accept
+
+
+cdef Move* best_accepted(Move* moves, weight_t* scores, int n) except NULL:
+    # Return the accepted move with the highest score; the earliest one wins
+    # ties.  Raises ValueError when no move is accepted.
+    cdef int first_accept
+    for first_accept in range(n):
+        if moves[first_accept].accept:
+            break
+    else:
+        # ValueError exists on Python 2 and 3 (StandardError is Python 2 only)
+        # and is a StandardError subclass there, so old handlers still match.
+        raise ValueError('No accepted move')
+    cdef int best = first_accept
+    cdef weight_t score = scores[first_accept]
+    cdef int i
+    for i in range(first_accept+1, n):
+        if moves[i].accept and scores[i] > score:
+            best = i
+            score = scores[i]
+    return &moves[best]
+
+
+cdef int transition(State *s, Move* move) except -1:
+    # Apply `move` at the current token: open and/or close an entity as the
+    # action requires, record the move's class, then advance to the next token.
+    if move.action == BEGIN:
+        begin_entity(s, move.label)
+    elif move.action == IN:
+        pass
+    elif move.action == LAST:
+        end_entity(s)
+    elif move.action == UNIT:
+        begin_entity(s, move.label)
+        end_entity(s)
+    elif move.action == OUT:
+        pass
+    s.tags[s.i] = move.clas
+    s.i += 1
+
+
+def get_n_moves(n_tags):
+    # One BEGIN, IN, LAST and UNIT move per tag, plus the single OUT move.
+    return n_tags + n_tags + n_tags + n_tags + 1
+
+
+cdef int fill_moves(Move* moves, int n_tags) except -1:
+    # Lay out the move table: a block of n_tags BEGIN moves, then IN, LAST
+    # and UNIT blocks of the same size, then the single OUT move -- that is
+    # get_n_moves(n_tags) entries in total.  One shared loop body advances `i`
+    # for every action, so no block overwrites the slots of another.
+    # NOTE(review): `clas` is not assigned here -- confirm where it is set.
+    cdef int i = 0
+    cdef int action
+    cdef int label
+    for action in (BEGIN, IN, LAST, UNIT):
+        for label in range(n_tags):
+            moves[i].action = action
+            moves[i].label = label
+            i += 1
+    # Assign the action of the final slot; comparing `label == OUT` here
+    # would be a no-op and leave the last move without its OUT action.
+    moves[i].action = OUT
+    return 0
diff --git a/spacy/ner/pystate.pxd b/spacy/ner/pystate.pxd
new file mode 100644
index 000000000..cc2333f39
--- /dev/null
+++ b/spacy/ner/pystate.pxd
@@ -0,0 +1,15 @@
+from cymem.cymem cimport Pool
+
+from .moves cimport Move
+from ._state cimport State
+
+
+# Python-visible wrapper around a C State and the move table that drives it.
+cdef class PyState:
+    cdef Pool mem
+    cdef readonly list entity_types
+    cdef readonly int n_classes
+    cdef readonly dict moves_by_name
+
+    cdef Move* _moves
+    cdef State* _s
diff --git a/spacy/ner/pystate.pyx b/spacy/ner/pystate.pyx
new file mode 100644
index 000000000..810d5d980
--- /dev/null
+++ b/spacy/ner/pystate.pyx
@@ -0,0 +1,64 @@
+from ._state cimport init_state
+from ._state cimport entity_is_open
+from .moves cimport fill_moves
+from .moves cimport transition
+from .moves cimport OUT
+from .moves import get_n_moves
+from .moves import ACTION_NAMES
+
+
+cdef class PyState:
+    """Python-visible wrapper around a C parse State, driven by move names."""
+    def __init__(self, tag_names, n_tokens):
+        cdef Move* m
+        self.mem = Pool()
+        self.entity_types = tag_names
+        self.n_classes = get_n_moves(len(self.entity_types))
+        assert self.n_classes != 0
+        self._moves = <Move*>self.mem.alloc(self.n_classes, sizeof(Move))
+        fill_moves(self._moves, len(self.entity_types))
+        self._s = init_state(self.mem, n_tokens)
+        self.moves_by_name = {}
+        for i in range(self.n_classes):
+            m = &self._moves[i]
+            action_name = ACTION_NAMES[m.action]
+            if m.action == OUT:
+                # OUT is not tied to an entity type, so it gets a bare name.
+                self.moves_by_name[action_name] = i
+            else:
+                tag_name = tag_names[m.label]
+                self.moves_by_name['%s-%s' % (action_name, tag_name)] = i
+
+    def transition(self, unicode move_name):
+        """Apply the move registered under `move_name` to the wrapped state."""
+        cdef int m_i = self.moves_by_name[move_name]
+        cdef Move* m = &self._moves[m_i]
+        transition(self._s, m)
+
+    def is_valid(self, unicode move_name):
+        # Not implemented yet: always returns None.
+        pass
+
+    def is_gold(self, unicode move_name):
+        # Not implemented yet: always returns None.
+        pass
+
+    property ent:
+        def __get__(self):
+            # j is -1 until the first entity is begun; indexing with it
+            # would read before the start of the ents array.
+            if self._s.j < 0:
+                return None
+            return self._s.ents[self._s.j]
+
+    property n_ents:
+        def __get__(self):
+            return self._s.j + 1
+
+    property i:
+        def __get__(self):
+            return self._s.i
+
+    property open_entity:
+        def __get__(self):
+            return entity_is_open(self._s)