* Work on shift-reduce NER

2025-08-09 14:44:52 +03:00 · 2014-11-10 16:28:56 +11:00 · 2014-11-10 16:28:56 +11:00 · 9f2587f5ec
commit 9f2587f5ec
parent f307eb2e36
12 changed files with 633 additions and 0 deletions
--- a/spacy/ner/__init__.pxd
+++ b/spacy/ner/__init__.pxd
--- a/spacy/ner/__init__.py
+++ b/spacy/ner/__init__.py
--- a/spacy/ner/_feats.pxd
+++ b/spacy/ner/_feats.pxd
--- a/spacy/ner/_feats.pyx
+++ b/spacy/ner/_feats.pyx
@ -0,0 +1,169 @@
+from spacy.context cimport FIELD_IDS, Token
+
+
+# Module-level copies of the per-position context slots exposed by FIELD_IDS.
+# The feature templates below read fields (sic, shape, pos, ...) off these.
+# NOTE(review): P1..P4 presumably address the previous tokens and N0..N4 the
+# current and following tokens — confirm against spacy.context.
+cdef Token P4 = FIELD_IDS.P4
+cdef Token P3 = FIELD_IDS.P3
+cdef Token P2 = FIELD_IDS.P2
+cdef Token P1 = FIELD_IDS.P1
+cdef Token N0 = FIELD_IDS.N0
+cdef Token N1 = FIELD_IDS.N1
+cdef Token N2 = FIELD_IDS.N2
+cdef Token N3 = FIELD_IDS.N3
+cdef Token N4 = FIELD_IDS.N4
+
+# The string literal below is a disabled template set: it is an expression
+# statement whose value is discarded, so it has no effect on the TEMPLATES
+# assigned at the bottom of this module.
+"""
+TEMPLATES = (
+    (N0.sic,),
+    (N0.cluster,),
+
+    (P1.pos,),
+    (P1.sic,),
+
+    (N1.norm,),
+    (N1.pos,),
+
+    (P1.ner,),
+    (P2.ner,),
+
+    (N0.cluster,),
+    (P1.cluster,),
+    (N1.cluster,),
+
+    (N0.is_alpha,),
+    (N0.is_digit,),
+    (N0.is_title,),
+    (N0.is_upper,),
+
+    (N0.is_title, N0.oft_title),
+    (N0.is_upper, N0.oft_upper),
+
+    (P1.cluster, N0.norm),
+    (N0.norm, N1.cluster),
+
+    (P1.ner, N0.pos),
+    (P2.ner, P1.ner, N0.pos),
+
+    (P2.pos, P1.pos, N0.sic),
+    (N0.sic, N1.pos, N2.pos)
+)
+"""
+
+# Templates over the sic, prefix, suffix, shape and norm fields of the
+# tokens around N0. Each inner tuple lists the fields of one template.
+# NOTE(review): fields in a multi-field tuple are presumably conjoined into
+# a single feature — confirm against the extractor.
+LOCAL = (
+    (N0.sic,),
+    (P1.sic,),
+    (N1.sic,),
+    (P2.sic,),
+    (N2.sic,),
+    (P3.sic,),
+    (N3.sic,),
+    (P4.sic,),
+    (N4.sic,),
+    
+    (P1.sic, N0.sic,),
+    (N0.sic, N1.sic),
+    
+    (N0.prefix,),
+    (N0.suffix,),
+
+    (P1.shape,),
+    (N0.shape,),
+    (N1.shape,),
+    (P1.shape, N0.shape,),
+    # NOTE(review): same two fields as the template above in reverse order;
+    # looks redundant — possibly (N0.shape, N1.shape) was intended. Confirm.
+    (N0.shape, P1.shape,),
+    (P1.shape, N0.shape, N1.shape),
+    (N2.shape,),
+    (P2.shape,),
+    (P3.shape,),
+    (N3.shape,),
+    (P4.shape,),
+    (N4.shape,),
+
+    (P2.norm, P1.norm, N0.norm),
+    (P1.norm, N0.norm, N1.norm),
+    (N0.norm, N1.norm, N2.norm)
+)
+
+# Templates over boolean token flags; currently only N0.is_title.
+BOOLS = (
+    (N0.is_title,),
+)
+
+
+# Templates that read the `ner` field of preceding positions (P1..P4),
+# alone or combined with sic/pos fields of nearby tokens.
+HISTORY = (
+    (P1.ner,),
+    (P1.ner, N0.sic,),
+    (P2.ner,),
+    (P2.ner, P1.ner),
+    (P2.ner, P1.ner, N0.sic),
+    (P2.pos, P1.ner, N0.pos),
+    (P2.ner, P1.pos, N0.pos),
+    (P3.ner,),
+    (P4.ner,),
+)
+
+# Templates over the `pos` field: one per position P4..N4, followed by
+# two- and three-position combinations around N0.
+POS = (
+    (P4.pos,),
+    (P3.pos,),
+    (P2.pos,),
+    (P1.pos,),
+    (N0.pos,),
+    (N1.pos,),
+    (N2.pos,),
+    (N3.pos,),
+    (N4.pos,),
+
+    (P1.pos, N0.pos),
+    (N0.pos, N1.pos),
+    (P2.pos, P1.pos, N0.pos),
+    (P1.pos, N0.pos, N1.pos),
+    (N0.pos, N1.pos, N2.pos)
+)
+
+# Templates over the `cluster` field: one per position P4..N4, plus the
+# two adjacent pairs that include N0.
+CLUSTERS = (
+    (P4.cluster,),
+    (P3.cluster,),
+    (P2.cluster,),
+    (P1.cluster,),
+    (N0.cluster,),
+    (N1.cluster,),
+    (N2.cluster,),
+    (N3.cluster,),
+    (N4.cluster,),
+
+    (P1.cluster, N0.cluster),
+    (N0.cluster, N1.cluster),
+)
+
+
+# Templates mixing the `cluster` field of one position with the `pos`
+# field of a neighbouring one.
+CLUSTER_POS = (
+    (P1.cluster, N0.pos),
+    # NOTE(review): same two fields as the template above in reverse order;
+    # possibly (P1.pos, N0.cluster) was intended. Confirm before changing,
+    # since editing templates changes the feature set.
+    (N0.pos, P1.cluster),
+    (N0.cluster, N1.pos),
+    (N0.pos, N1.cluster)
+)
+
+
+# Templates over the `in_*` fields of N0, P1 and N1 (the same seven fields
+# for each of the three positions).
+# NOTE(review): these are presumably gazetteer/word-list membership flags —
+# confirm against spacy.context.
+GAZ = (
+    (N0.in_males,),
+    (N0.in_females,),
+    (N0.in_surnames,),
+    (N0.in_places,),
+    (N0.in_games,),
+    (N0.in_celebs,),
+    (N0.in_names,),
+    (P1.in_males,),
+    (P1.in_females,),
+    (P1.in_surnames,),
+    (P1.in_places,),
+    (P1.in_games,),
+    (P1.in_celebs,),
+    (P1.in_names,),
+    (N1.in_males,),
+    (N1.in_females,),
+    (N1.in_surnames,),
+    (N1.in_places,),
+    (N1.in_games,),
+    (N1.in_celebs,),
+    (N1.in_names,),
+)
+
+# Full template set: the concatenation of the groups above. The
+# concatenation order fixes each template's position in the tuple.
+TEMPLATES = LOCAL + HISTORY + CLUSTERS + POS + CLUSTER_POS + GAZ + BOOLS
--- a/spacy/ner/_state.pxd
+++ b/spacy/ner/_state.pxd
@ -0,0 +1,27 @@
+from cymem.cymem cimport Pool
+from .moves cimport Move
+
+
+# One recognised (or in-progress) entity span.
+cdef struct Entity:
+    # Token index at which the entity was begun (State.i at begin_entity).
+    int start
+    # Set by end_entity to State.i + 1, i.e. one past the last token.
+    int end
+    # Entity label id; init_state fills every slot with -1.
+    int label
+
+
+# Parser state for one sentence, allocated by init_state.
+cdef struct State:
+    # Array of `length` Entity slots.
+    Entity* ents
+    # Array of `length` ints; transition() stores the chosen move's class
+    # for each token here.
+    int* tags
+    # Index of the next token to process; advanced by transition().
+    int i
+    # Index into `ents` of the most recently begun entity; -1 when none.
+    int j
+    # Number of tokens in the sentence.
+    int length
+
+
+# Start a new entity with `label` at the current token (advances s.j).
+cdef int begin_entity(State* s, label) except -1
+
+# Record the end of the most recent entity as one past the current token.
+cdef int end_entity(State* s) except -1
+
+# Allocate a State (and its ents/tags arrays) for `sent_length` tokens from `mem`.
+cdef State* init_state(Pool mem, int sent_length) except NULL
+
+# Report whether an entity is currently open.
+cdef bint entity_is_open(State *s) except -1
+
+# Report whether the open entity already disagrees with the gold moves
+# at its start position.
+cdef bint entity_is_sunk(State *s, Move* golds) except -1
--- a/spacy/ner/_state.pyx
+++ b/spacy/ner/_state.pyx
@ -0,0 +1,40 @@
+from .moves cimport BEGIN, UNIT
+
+
+cdef int begin_entity(State* s, label) except -1:
+    """Open a new entity at the current token.
+
+    Advances the entity cursor `s.j` to the next slot and stores the
+    current token index `s.i` and `label` in it.
+    """
+    cdef Entity* ent
+    s.j += 1
+    ent = &s.ents[s.j]
+    ent.start = s.i
+    ent.label = label
+
+
+cdef int end_entity(State* s) except -1:
+    """Mark the most recent entity as ending one past the current token."""
+    cdef Entity* ent = &s.ents[s.j]
+    ent.end = s.i + 1
+
+
+cdef State* init_state(Pool mem, int sent_length) except NULL:
+    """Allocate a fresh parser state for a sentence of `sent_length` tokens.
+
+    All memory comes from `mem`. The entity cursor starts at -1 (no
+    entity yet) and every entity slot's label is set to -1.
+    """
+    cdef int idx
+    cdef State* state = <State*>mem.alloc(1, sizeof(State))
+    state.length = sent_length
+    state.j = -1
+    state.tags = <int*>mem.alloc(sent_length, sizeof(int))
+    state.ents = <Entity*>mem.alloc(sent_length, sizeof(Entity))
+    for idx in range(sent_length):
+        state.ents[idx].label = -1
+    return state
+
+
+cdef bint entity_is_open(State *s) except -1:
+    """Return True while the most recent entity has been begun but not ended.
+
+    The label alone cannot signal this: begin_entity() sets it and nothing
+    resets it, so checking only `label != -1` reports every entity as open
+    forever once the first one is begun. end_entity() stores `s.i + 1`
+    (always >= 1) in `end`, so an `end` of 0 marks a still-open entity.
+
+    NOTE(review): relies on the Entity array being zero-initialised by the
+    allocator (cymem's Pool.alloc is documented as zero-filling) — confirm.
+    """
+    cdef Entity* ent
+    if s.j < 0:
+        # No entity has been begun yet.
+        return False
+    ent = &s.ents[s.j]
+    return ent.label != -1 and ent.end == 0
+
+
+cdef bint entity_is_sunk(State *s, Move* golds) except -1:
+    """Return True if the open entity can no longer match the gold analysis.
+
+    With no open entity the answer is False. Otherwise the gold move at the
+    entity's start token is inspected: the entity is sunk when that move is
+    neither BEGIN nor UNIT, or when its label differs from the entity's.
+    """
+    cdef Entity* ent
+    cdef Move* gold
+    if not entity_is_open(s):
+        return False
+    ent = &s.ents[s.j]
+    gold = &golds[ent.start]
+    return (gold.action != BEGIN and gold.action != UNIT) or gold.label != ent.label
--- a/spacy/ner/greedy_parser.pxd
+++ b/spacy/ner/greedy_parser.pxd
@ -0,0 +1,25 @@
+from cymem.cymem cimport Pool
+from thinc.features cimport Extractor
+from thinc.learner cimport LinearModel
+from thinc.typedefs cimport *
+
+from ..tokens cimport Tokens
+from ..typedefs cimport *
+
+from .moves cimport Move
+
+
+# C-level declaration of the greedy NER parser (implemented in greedy_parser.pyx).
+cdef class NERParser:
+    cdef Pool mem
+    cdef Extractor extractor
+    cdef LinearModel model
+
+    # Both are assigned in __init__. A cdef class has no instance __dict__,
+    # so attributes must be declared here; the declarations mirror PyState.
+    cdef readonly list entity_types
+    cdef readonly int n_classes
+
+    # Table of n_classes moves, and scratch buffers reused on every step.
+    cdef Move* _moves
+    cdef atom_t* _context
+    cdef feat_t* _feats
+    cdef weight_t* _values
+    cdef weight_t* _scores
+
+
+    cpdef int train(self, Tokens tokens, golds)
+    cpdef int set_tags(self, Tokens tokens) except -1
--- a/spacy/ner/greedy_parser.pyx
+++ b/spacy/ner/greedy_parser.pyx
@ -0,0 +1,81 @@
+cimport cython
+import random
+import os
+from os import path
+import shutil
+import json
+
+from thinc.features cimport ConjFeat
+
+from ..context cimport fill_context
+from ..context cimport N_FIELDS
+from .moves cimport Move
+from .moves cimport fill_moves, transition, best_accepted
+from .moves cimport set_accept_if_valid, set_accept_if_oracle
+from .moves import get_n_moves
+from ._state cimport State
+from ._state cimport init_state
+
+
+cdef class NERParser:
+    """Greedy transition-based named-entity recogniser scored by a linear model."""
+
+    def __init__(self, model_dir):
+        """Set up the parser from the files in `model_dir`.
+
+        Reads `config.json` for its 'templates' and 'entity_types' entries,
+        builds the move table, and loads weights from `model` in the same
+        directory when that file exists.
+        """
+        self.mem = Pool()
+        # Use a context manager so the config file handle is not leaked.
+        with open(path.join(model_dir, 'config.json')) as file_:
+            cfg = json.load(file_)
+        templates = cfg['templates']
+        self.entity_types = cfg['entity_types']
+        self.extractor = Extractor(templates, [ConjFeat] * len(templates))
+        self.n_classes = get_n_moves(len(self.entity_types))
+        self._moves = <Move*>self.mem.alloc(self.n_classes, sizeof(Move))
+        fill_moves(self._moves, len(self.entity_types))
+        # The model scores one class per move. This previously read
+        # `self.tag_names`, an attribute this class never defines.
+        self.model = LinearModel(self.n_classes)
+        model_loc = path.join(model_dir, 'model')
+        if path.exists(model_loc):
+            self.model.load(model_loc)
+
+        self._context = <atom_t*>self.mem.alloc(N_FIELDS, sizeof(atom_t))
+        self._feats = <feat_t*>self.mem.alloc(self.extractor.n+1, sizeof(feat_t))
+        self._values = <weight_t*>self.mem.alloc(self.extractor.n+1, sizeof(weight_t))
+        self._scores = <weight_t*>self.mem.alloc(self.model.nr_class, sizeof(weight_t))
+
+    cpdef int train(self, Tokens tokens, gold_classes):
+        """Run one training pass over `tokens` and update the model.
+
+        `gold_classes` holds one 1-based move index per token. At each step
+        the best valid move is compared with the best oracle-approved move;
+        the model receives an empty update when they agree and a +1/-1
+        feature-count update otherwise. The predicted move is then applied
+        and its class written to `tokens.ner`. Returns 0.
+        """
+        cdef Pool mem = Pool()
+        cdef State* s = init_state(mem, tokens.length)
+        cdef Move* golds = <Move*>mem.alloc(len(gold_classes), sizeof(Move))
+        cdef Move* guess
+        cdef Move* gold
+        for i, clas in enumerate(gold_classes):
+            # Check the index instead of a Move `id` field, which the struct
+            # does not have; this also prevents reading outside the table.
+            assert 1 <= clas <= self.n_classes
+            golds[i] = self._moves[clas - 1]
+        while s.i < tokens.length:
+            fill_context(self._context, s.i, tokens)
+            self.extractor.extract(self._feats, self._values, self._context, NULL)
+            self.model.score(self._scores, self._feats, self._values)
+
+            set_accept_if_valid(self._moves, self.n_classes, s)
+            guess = best_accepted(self._moves, self._scores, self.n_classes)
+
+            set_accept_if_oracle(self._moves, golds, self.n_classes, s) # TODO
+            gold = best_accepted(self._moves, self._scores, self.n_classes)
+
+            if guess.clas == gold.clas:
+                # Correct prediction: register an empty update and carry on.
+                # Returning here would stop training at the first correct
+                # token and never advance the state.
+                counts = {}
+            else:
+                counts = {guess.clas: {}, gold.clas: {}}
+                self.extractor.count(counts[gold.clas], self._feats, 1)
+                self.extractor.count(counts[guess.clas], self._feats, -1)
+            self.model.update(counts)
+
+            transition(s, guess)
+            tokens.ner[s.i-1] = s.tags[s.i-1]
+        return 0
+
+    cpdef int set_tags(self, Tokens tokens) except -1:
+        """Greedily predict a move for every token and write it to `tokens.ner`."""
+        cdef Pool mem = Pool()
+        cdef State* s = init_state(mem, tokens.length)
+        cdef Move* move
+        while s.i < tokens.length:
+            fill_context(self._context, s.i, tokens)
+            self.extractor.extract(self._feats, self._values, self._context, NULL)
+            self.model.score(self._scores, self._feats, self._values)
+            set_accept_if_valid(self._moves, self.n_classes, s)
+            move = best_accepted(self._moves, self._scores, self.n_classes)
+            transition(s, move)
+            # transition() advanced s.i, so the token just tagged is s.i - 1.
+            tokens.ner[s.i-1] = s.tags[s.i-1]
--- a/spacy/ner/moves.pxd
+++ b/spacy/ner/moves.pxd
@ -0,0 +1,32 @@
+from cymem.cymem cimport Pool
+
+from thinc.typedefs cimport class_t
+from thinc.typedefs cimport weight_t
+
+from ._state cimport State
+
+# The five transition actions. ACTION_NAMES in moves.pyx maps them to the
+# tag prefixes 'B', 'I', 'L', 'U' and 'O'. N_ACTIONS is not an action: as
+# the last member it equals the number of actions.
+cpdef enum ActionType:
+    BEGIN
+    IN
+    LAST
+    UNIT
+    OUT
+    N_ACTIONS
+
+
+# One entry in the parser's move table.
+cdef struct Move:
+    # Class id; transition() writes it into State.tags, and the trainer
+    # uses it as the key of its update counts.
+    class_t clas
+    # An ActionType value.
+    int action
+    # Entity label id the action applies to.
+    int label
+    # Scratch flag set by set_accept_if_valid / set_accept_if_oracle and
+    # read by best_accepted.
+    bint accept
+
+
+# Flag each of the `n` moves as accepted iff the oracle allows it in state
+# `s` given the gold moves; returns the number accepted.
+cdef int set_accept_if_oracle(Move* moves, Move* golds, int n, State* s) except 0
+
+# Flag each of the `n` moves as accepted iff it is structurally valid in
+# state `s`; returns the number accepted.
+cdef int set_accept_if_valid(Move* moves, int n, State* s) except 0
+
+# Return the accepted move with the highest score; raises if none is accepted.
+cdef Move* best_accepted(Move* moves, weight_t* scores, int n) except NULL
+
+# Apply move `m` to state `s` and advance to the next token.
+cdef int transition(State *s, Move* m) except -1
+
+# Populate the move table for `n_tags` entity labels.
+cdef int fill_moves(Move* moves, int n_tags) except -1
--- a/spacy/ner/moves.pyx
+++ b/spacy/ner/moves.pyx
@ -0,0 +1,193 @@
+from ._state cimport begin_entity
+from ._state cimport end_entity
+from ._state cimport entity_is_open
+from ._state cimport entity_is_sunk
+
+# Tag prefix for each action, indexed by its ActionType value.
+ACTION_NAMES = [''] * N_ACTIONS
+ACTION_NAMES[<int>OUT] = 'O'
+ACTION_NAMES[<int>UNIT] = 'U'
+ACTION_NAMES[<int>LAST] = 'L'
+ACTION_NAMES[<int>IN] = 'I'
+ACTION_NAMES[<int>BEGIN] = 'B'
+
+
+cdef bint can_begin(State* s, int label):
+    """BEGIN is valid only when no entity is currently open (`label` is unused)."""
+    if entity_is_open(s):
+        return False
+    return True
+
+
+cdef bint can_in(State* s, int label):
+    """IN is valid only inside an open entity that carries the same label."""
+    # The Entity struct's field is `label`; it has no `tag` member.
+    return entity_is_open(s) and s.ents[s.j].label == label
+
+
+cdef bint can_last(State* s, int label):
+    """LAST is valid only inside an open entity that carries the same label."""
+    # The Entity struct's field is `label`; it has no `tag` member.
+    return entity_is_open(s) and s.ents[s.j].label == label
+
+
+cdef bint can_unit(State* s, int label):
+    """UNIT is valid only when no entity is currently open (`label` is unused)."""
+    if entity_is_open(s):
+        return False
+    return True
+
+
+cdef bint can_out(State* s, int label):
+    """OUT is valid only when no entity is currently open (`label` is unused)."""
+    if entity_is_open(s):
+        return False
+    return True
+
+
+# Decide whether taking action `act` with label `tag` is acceptable to the
+# oracle at the current token.
+#
+#   act, tag     -- the candidate move's action and label
+#   g_act, g_tag -- the gold move's action and label for the current token
+#   next_act     -- the gold action for the following token
+#   is_sunk      -- whether the currently open entity already disagrees
+#                   with the gold analysis (see entity_is_sunk)
+#
+# The body is a decision table keyed on (act, g_act); the inline comments
+# give the verdict for each cell.
+# NOTE(review): if `g_act` (or `act`) matches none of the five actions the
+# function falls off the end without an explicit return — presumably that
+# yields a false value; confirm.
+cdef bint is_oracle(ActionType act, int tag, ActionType g_act, int g_tag,
+                    ActionType next_act, bint is_sunk):
+    if act == BEGIN:
+        if g_act == BEGIN:
+            # B, Gold B --> Label match
+            return tag == g_tag
+        else:
+            # B, Gold I --> False (P)
+            # B, Gold L --> False (P)
+            # B, Gold O --> False (P)
+            # B, Gold U --> False (P)
+            return False
+    elif act == IN:
+        if g_act == BEGIN:
+            # I, Gold B --> True (P of bad open entity sunk, R of this entity sunk)
+            return True
+        elif g_act == IN:
+            # I, Gold I --> True (label forced by prev, if mismatch, P and R both sunk)
+            return True
+        elif g_act == LAST:
+            # I, Gold L --> True iff this entity sunk and next tag == O
+            return is_sunk and next_act == OUT
+        elif g_act == OUT:
+            # I, Gold O --> True iff next tag == O
+            return next_act == OUT
+        elif g_act == UNIT:
+            # I, Gold U --> True iff next tag == O
+            return next_act == OUT
+    elif act == LAST:
+        if g_act == BEGIN:
+            # L, Gold B --> True
+            return True
+        elif g_act == IN:
+            # L, Gold I --> True iff this entity sunk
+            return is_sunk
+        elif g_act == LAST:
+            # L, Gold L --> True
+            return True
+        elif g_act == OUT:
+            # L, Gold O --> True
+            return True
+        elif g_act == UNIT:
+            # L, Gold U --> True
+            return True
+    elif act == OUT:
+        if g_act == BEGIN:
+            # O, Gold B --> False
+            return False
+        elif g_act == IN:
+            # O, Gold I --> True
+            return True
+        elif g_act == LAST:
+            # O, Gold L --> True
+            return True
+        elif g_act == OUT:
+            # O, Gold O --> True
+            return True
+        elif g_act == UNIT:
+            # O, Gold U --> False
+            return False
+    elif act == UNIT:
+        if g_act == UNIT:
+            # U, Gold U --> True iff tag match
+            return tag == g_tag
+        else:
+            # U, Gold B --> False
+            # U, Gold I --> False
+            # U, Gold L --> False
+            # U, Gold O --> False
+            return False
+    
+
+cdef int set_accept_if_valid(Move* moves, int n_classes, State* s) except 0:
+    """Set `accept` on each move according to whether it is valid in state `s`.
+
+    Returns the number of accepted moves. A move whose action is none of
+    the five known actions keeps its previous `accept` value.
+    """
+    cdef int total = 0
+    cdef int idx
+    cdef int action
+    cdef Move* move
+    for idx in range(n_classes):
+        move = &moves[idx]
+        action = move.action
+        if action == BEGIN:
+            move.accept = can_begin(s, move.label)
+        elif action == IN:
+            move.accept = can_in(s, move.label)
+        elif action == LAST:
+            move.accept = can_last(s, move.label)
+        elif action == UNIT:
+            move.accept = can_unit(s, move.label)
+        elif action == OUT:
+            move.accept = can_out(s, move.label)
+        total += move.accept
+    return total
+
+
+cdef int set_accept_if_oracle(Move* moves, Move* golds, int n_classes, State* s) except 0:
+    """Set `accept` on each move according to the oracle for the current token.
+
+    `golds` is indexed by token position. The gold action of the following
+    token is passed to is_oracle; past the end of the sentence it is taken
+    to be OUT. Returns the number of accepted moves.
+    """
+    cdef Move* g = &golds[s.i]
+    cdef ActionType next_act
+    cdef bint is_sunk = entity_is_sunk(s, golds)
+    cdef Move* m
+    cdef int n_accept = 0
+    cdef int i
+    # The look-ahead index is s.i + 1, so that is what must stay below
+    # s.length. Testing s.i alone reads one element past the end of
+    # `golds` on the last token (assuming it holds s.length entries).
+    if s.i + 1 < s.length:
+        next_act = <ActionType>golds[s.i + 1].action
+    else:
+        next_act = OUT
+    for i in range(n_classes):
+        m = &moves[i]
+        m.accept = is_oracle(<ActionType>m.action, m.label, <ActionType>g.action,
+                             g.label, next_act, is_sunk)
+        n_accept += m.accept
+    return n_accept
+
+
+cdef Move* best_accepted(Move* moves, weight_t* scores, int n) except NULL:
+    """Return a pointer to the highest-scoring move whose `accept` flag is set.
+
+    `scores[i]` is the score of `moves[i]`. Ties go to the earliest move.
+
+    Raises ValueError when no move is accepted. StandardError, raised
+    before, only exists on Python 2, where ValueError is a subclass of it,
+    so existing `except StandardError` handlers still catch this.
+    """
+    cdef int best = -1
+    cdef int i
+    for i in range(n):
+        if not moves[i].accept:
+            continue
+        # Strict '>' keeps the first accepted move when scores are equal.
+        if best == -1 or scores[i] > scores[best]:
+            best = i
+    if best == -1:
+        raise ValueError("No accepted move to choose from")
+    return &moves[best]
+
+
+cdef int transition(State *s, Move* move) except -1:
+    """Apply `move` to the state, then advance to the next token.
+
+    BEGIN opens an entity, LAST closes the open one, and UNIT opens and
+    immediately closes one. IN and OUT change no entity. In every case the
+    move's class is recorded as the tag of the current token.
+    """
+    cdef int action = move.action
+    if action == BEGIN or action == UNIT:
+        begin_entity(s, move.label)
+    if action == LAST or action == UNIT:
+        end_entity(s)
+    s.tags[s.i] = move.clas
+    s.i += 1
+
+
+def get_n_moves(n_tags):
+    """Return the size of the move table for `n_tags` entity labels.
+
+    There is one BEGIN, IN, LAST and UNIT move per label, plus a single
+    OUT move.
+    """
+    per_label_actions = 4
+    return per_label_actions * n_tags + 1
+
+
+cdef int fill_moves(Move* moves, int n_tags) except -1:
+    """Populate the move table for `n_tags` entity labels.
+
+    Layout: n_tags BEGIN moves, then n_tags IN, n_tags LAST and n_tags
+    UNIT moves (labels 0..n_tags-1 within each group), then one OUT move.
+    That is get_n_moves(n_tags) entries in total, which `moves` must be
+    able to hold. Each move's `clas` is its index in the table, matching
+    how best_accepted pairs `scores[i]` with `moves[i]`.
+    """
+    cdef int i = 0
+    cdef int action
+    cdef int label
+    for action in (BEGIN, IN, LAST, UNIT):
+        for label in range(n_tags):
+            moves[i].clas = i
+            moves[i].action = action
+            moves[i].label = label
+            # Advance for every group. The IN loop used not to, so all IN
+            # moves overwrote one slot and later groups landed in the
+            # wrong positions.
+            i += 1
+    # The single OUT move. This was a no-op comparison (`label == OUT`)
+    # that left the last slot without an action.
+    moves[i].clas = i
+    moves[i].action = OUT
--- a/spacy/ner/pystate.pxd
+++ b/spacy/ner/pystate.pxd
@ -0,0 +1,14 @@
+from cymem.cymem cimport Pool
+
+from .moves cimport Move
+from ._state cimport State
+
+
+# Python-visible wrapper around a C parser State and its move table
+# (implemented in pystate.pyx).
+cdef class PyState:
+    # Pool that owns the move table and the state.
+    cdef Pool mem
+    # The tag names passed to the constructor.
+    cdef readonly list entity_types
+    # Number of moves, from get_n_moves(len(entity_types)).
+    cdef readonly int n_classes
+    # Maps '<action>-<tag>' names to indices into _moves.
+    cdef readonly dict moves_by_name
+    
+    cdef Move* _moves
+    cdef State* _s
--- a/spacy/ner/pystate.pyx
+++ b/spacy/ner/pystate.pyx
@ -0,0 +1,52 @@
+from ._state cimport init_state
+from ._state cimport entity_is_open
+from .moves cimport fill_moves
+from .moves cimport transition
+from .moves import get_n_moves
+from .moves import ACTION_NAMES
+
+
+cdef class PyState:
+    """Python-level handle on a parser State, for driving moves by name."""
+
+    def __init__(self, tag_names, n_tokens):
+        """Build the move table for `tag_names` and a state for `n_tokens` tokens.
+
+        Every move is registered in `moves_by_name` under the key
+        '<action letter>-<tag name>'.
+        """
+        cdef Move* m
+        self.mem = Pool()
+        self.entity_types = tag_names
+        self.n_classes = get_n_moves(len(self.entity_types))
+        assert self.n_classes != 0
+        self._moves = <Move*>self.mem.alloc(self.n_classes, sizeof(Move))
+        fill_moves(self._moves, len(self.entity_types))
+        self._s = init_state(self.mem, n_tokens)
+        self.moves_by_name = {}
+        for i in range(self.n_classes):
+            m = &self._moves[i]
+            action_name = ACTION_NAMES[m.action]
+            tag_name = tag_names[m.label]
+            self.moves_by_name['%s-%s' % (action_name, tag_name)] = i
+
+    def transition(self, unicode move_name):
+        """Apply the move registered as `move_name`; KeyError if unknown."""
+        cdef int m_i = self.moves_by_name[move_name]
+        cdef Move* m = &self._moves[m_i]
+        transition(self._s, m)
+
+    def is_valid(self, unicode move_name):
+        """Not implemented yet; always returns None."""
+        pass
+
+    def is_gold(self, unicode move_name):
+        """Not implemented yet; always returns None."""
+        pass
+
+    property ent:
+        def __get__(self):
+            # The entity cursor is -1 until the first entity is begun, and
+            # indexing with it would read before the start of the array.
+            if self._s.j < 0:
+                return None
+            return self._s.ents[self._s.j]
+
+    property n_ents:
+        def __get__(self):
+            # j is the index of the latest entity, so the count is j + 1.
+            return self._s.j + 1
+
+    property i:
+        def __get__(self):
+            return self._s.i
+
+    property open_entity:
+        def __get__(self):
+            return entity_is_open(self._s)
+
+
+
+