From 3d0570685cdf3bc2229f7a3ac62c78074ced6933 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 2 Feb 2015 16:38:52 +1100 Subject: [PATCH] * Add NER transition system --- spacy/syntax/ner.pxd | 28 ++++++ spacy/syntax/ner.pyx | 215 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 243 insertions(+) create mode 100644 spacy/syntax/ner.pxd create mode 100644 spacy/syntax/ner.pyx diff --git a/spacy/syntax/ner.pxd b/spacy/syntax/ner.pxd new file mode 100644 index 000000000..da8163e51 --- /dev/null +++ b/spacy/syntax/ner.pxd @@ -0,0 +1,28 @@ +from cymem.cymem cimport Pool + +from thinc.typedefs cimport weight_t + + +from ._state cimport State + + +cdef struct Transition: + int clas + int move + int label + int cost + weight_t score + + +cdef class TransitionSystem: + cdef Pool mem + cdef readonly int n_moves + cdef dict label_ids + + cdef const Transition* _moves + + cdef Transition best_valid(self, const weight_t* scores, const State* s) except * + cdef Transition best_gold(self, Transition* guess, const weight_t* scores, + const State* s, + const int* gold_heads, const int* gold_labels) except * + cdef int transition(self, State *s, const Transition* t) except -1 diff --git a/spacy/syntax/ner.pyx b/spacy/syntax/ner.pyx new file mode 100644 index 000000000..b66bd11fd --- /dev/null +++ b/spacy/syntax/ner.pyx @@ -0,0 +1,215 @@ +from __future__ import unicode_literals + +from ._state cimport State +from ._state cimport has_head, get_idx, get_s0, get_n0 +from ._state cimport is_final, at_eol, pop_stack, push_stack, add_dep +from ._state cimport head_in_buffer, children_in_buffer +from ._state cimport head_in_stack, children_in_stack + +from ..structs cimport TokenC + + +DEF NON_MONOTONIC = True +DEF USE_BREAK = True + + +cdef enum: + MISSING + BEGIN + IN + LAST + UNIT + OUT + N_MOVES + + +cdef int is_valid(ActionType act, int label, State* s) except -1: + if act == BEGIN: + return not entity_is_open(s) + elif act == IN: + return entity_is_open(s) and s.curr.label == label + elif act == LAST: + return entity_is_open(s) and s.curr.label == label + elif act == UNIT: + return not entity_is_open(s) + elif act == OUT: + return not entity_is_open(s) + else: + raise UnknownMove(act, label) + + +cdef bint is_gold(ActionType act, int tag, ActionType g_act, int g_tag, + ActionType next_act, bint is_sunk): + if g_act == MISSING: + return True + if act == BEGIN: + if g_act == BEGIN: + # B, Gold B --> Label match + return tag == g_tag + else: + # B, Gold I --> False (P) + # B, Gold L --> False (P) + # B, Gold O --> False (P) + # B, Gold U --> False (P) + return False + elif act == IN: + if g_act == BEGIN: + # I, Gold B --> True (P of bad open entity sunk, R of this entity sunk) + return True + elif g_act == IN: + # I, Gold I --> True (label forced by prev, if mismatch, P and R both sunk) + return True + elif g_act == LAST: + # I, Gold L --> True iff this entity sunk and next tag == O + return is_sunk and (next_act == OUT or next_act == MISSING) + elif g_act == OUT: + # I, Gold O --> True iff next tag == O + return next_act == OUT or next_act == MISSING + elif g_act == UNIT: + # I, Gold U --> True iff next tag == O + return next_act == OUT + elif act == LAST: + if g_act == BEGIN: + # L, Gold B --> True + return True + elif g_act == IN: + # L, Gold I --> True iff this entity sunk + return is_sunk + elif g_act == LAST: + # L, Gold L --> True + return True + elif g_act == OUT: + # L, Gold O --> True + return True + elif g_act == UNIT: + # L, Gold U --> True + return True + elif act == OUT: + if g_act == BEGIN: + # O, Gold B --> False + return False + elif g_act == IN: + # O, Gold I --> True + return True + elif g_act == LAST: + # O, Gold L --> True + return True + elif g_act == OUT: + # O, Gold O --> True + return True + elif g_act == UNIT: + # O, Gold U --> False + return False + elif act == UNIT: + if g_act == UNIT: + # U, Gold U --> True iff tag match + return tag == g_tag + else: + # U, Gold B --> False + # U, Gold I --> False + # U, Gold L --> False + # U, Gold O --> False + return False + + +cdef bint entity_is_open(State *s) except -1: + return s.sent[s.i - 1].ent.tag >= 1 + + +cdef bint entity_is_sunk(State *s, Move* golds) except -1: + if not entity_is_open(s): + return False + + cdef const Entity* curr = &s.sent[s.i - 1].ent + cdef Move* gold = &golds[(s.i - 1) + curr.start] + if gold.action != BEGIN and gold.action != UNIT: + return True + elif gold.label != s.curr.label: + return True + else: + return False + + +cdef class TransitionSystem: + def __init__(self, list entity_type_strs): + self.mem = Pool() + + cdef Move* m + label_names = {'-': 0} + for i, tag_name in enumerate(tag_names): + m = &moves[i] + if '-' in tag_name: + action_str, label = tag_name.split('-') + elif tag_name == 'O': + action_str = 'O' + label = '-' + elif tag_name == 'NULL' or tag_name == 'EOL': + action_str = '?' + label = '-' + else: + raise StandardError(tag_name) + m.action = ACTION_NAMES.index(action_str) + m.label = label_names.setdefault(label, len(label_names)) + m.clas = i + + cdef int transition(self, State *s, Move* move) except -1: + if move.action == BEGIN: + s.curr.start = s.i + s.curr.label = label + elif move.action == IN: + pass + elif move.action == LAST: + s.curr.end = s.i + s.ents[s.j] = s.curr + s.j += 1 + s.curr.start = 0 + s.curr.label = -1 + s.curr.end = 0 + elif move.action == UNIT: + begin_entity(s, move.label) + end_entity(s) + elif move.action == OUT: + pass + s.tags[s.i] = move.clas + s.i += 1 + + cdef Transition best_valid(self, const weight_t* scores, const State* s) except *: + cdef int best = -1 + cdef weight_t score = -90000 + cdef const Transition* m + cdef int i + for i in range(self.n_moves): + m = &self._moves[i] + if _is_valid(s, m.ent_move, m.ent_label) and scores[i] > score: + best = i + score = scores[i] + assert best >= 0 + cdef Transition t = self._moves[best] + t.score = score + return t + + cdef Transition best_gold(self, Transition* guess, const weight_t* scores, + const State* s, Move* golds) except *: + + cdef Move* g = &golds[s.i] + cdef ActionType next_act = golds[s.i+1].action if s.i < s.length else OUT + cdef bint is_sunk = entity_is_sunk(s, golds) + cdef Move* m + cdef int n_accept = 0 + for i in range(1, self.n_classes): + m = &moves[i] + if _is_valid(s, m.move, m.label) and \ + _is_gold(s, m.move, m.label, next_act, is_sunk) and \ + scores[i] > score: + best = i + score = scores[i] + assert best >= 0 + return self._moves[best] + + +class OracleError(Exception): + pass + + +class UnknownMove(Exception): + pass