* Work on shift-reduce NER

This commit is contained in:
Matthew Honnibal 2014-11-10 16:28:56 +11:00
parent f307eb2e36
commit 9f2587f5ec
12 changed files with 633 additions and 0 deletions

0
spacy/ner/__init__.pxd Normal file
View File

0
spacy/ner/__init__.py Normal file
View File

0
spacy/ner/_feats.pxd Normal file
View File

169
spacy/ner/_feats.pyx Normal file
View File

@ -0,0 +1,169 @@
from spacy.context cimport FIELD_IDS, Token
# Module-level aliases for the nine context slots read from FIELD_IDS, so the
# feature templates below can be written as e.g. `N0.sic`.
# NOTE(review): presumably P1..P4 are the tokens before the current one and
# N0..N4 the current and following tokens -- confirm against spacy.context.
cdef Token P4 = FIELD_IDS.P4
cdef Token P3 = FIELD_IDS.P3
cdef Token P2 = FIELD_IDS.P2
cdef Token P1 = FIELD_IDS.P1
cdef Token N0 = FIELD_IDS.N0
cdef Token N1 = FIELD_IDS.N1
cdef Token N2 = FIELD_IDS.N2
cdef Token N3 = FIELD_IDS.N3
cdef Token N4 = FIELD_IDS.N4
# The string literal below holds an earlier TEMPLATES definition. It is a bare
# expression statement (never assigned), so it acts as commented-out code; the
# TEMPLATES actually bound by this module is built at the bottom of the file.
"""
TEMPLATES = (
(N0.sic,),
(N0.cluster,),
(P1.pos,),
(P1.sic,),
(N1.norm,),
(N1.pos,),
(P1.ner,),
(P2.ner,),
(N0.cluster,),
(P1.cluster,),
(N1.cluster,),
(N0.is_alpha,),
(N0.is_digit,),
(N0.is_title,),
(N0.is_upper,),
(N0.is_title, N0.oft_title),
(N0.is_upper, N0.oft_upper),
(P1.cluster, N0.norm),
(N0.norm, N1.cluster),
(P1.ner, N0.pos),
(P2.ner, P1.ner, N0.pos),
(P2.pos, P1.pos, N0.sic),
(N0.sic, N1.pos, N2.pos)
)
"""
# Templates over the sic / prefix / suffix / shape / norm fields of the
# context slots P4..N4. Each inner tuple lists the fields that make up one
# template (single fields, pairs and triples).
LOCAL = (
(N0.sic,),
(P1.sic,),
(N1.sic,),
(P2.sic,),
(N2.sic,),
(P3.sic,),
(N3.sic,),
(P4.sic,),
(N4.sic,),
(P1.sic, N0.sic,),
(N0.sic, N1.sic),
(N0.prefix,),
(N0.suffix,),
(P1.shape,),
(N0.shape,),
(N1.shape,),
(P1.shape, N0.shape,),
# NOTE(review): same two fields as the previous template, in reverse order --
# possibly a typo for (N0.shape, N1.shape); confirm before changing.
(N0.shape, P1.shape,),
(P1.shape, N0.shape, N1.shape),
(N2.shape,),
(P2.shape,),
(P3.shape,),
(N3.shape,),
(P4.shape,),
(N4.shape,),
(P2.norm, P1.norm, N0.norm),
(P1.norm, N0.norm, N1.norm),
(N0.norm, N1.norm, N2.norm)
)
# Templates over boolean flag fields; currently only the is_title field of N0.
BOOLS = (
(N0.is_title,),
)
# Templates that read the `ner` field of the preceding slots (P1..P4), alone
# or combined with sic / pos fields of nearby slots.
HISTORY = (
(P1.ner,),
(P1.ner, N0.sic,),
(P2.ner,),
(P2.ner, P1.ner),
(P2.ner, P1.ner, N0.sic),
(P2.pos, P1.ner, N0.pos),
(P2.ner, P1.pos, N0.pos),
(P3.ner,),
(P4.ner,),
)
# Templates over the `pos` field: one per context slot P4..N4, then adjacent
# pairs and triples around N0.
POS = (
(P4.pos,),
(P3.pos,),
(P2.pos,),
(P1.pos,),
(N0.pos,),
(N1.pos,),
(N2.pos,),
(N3.pos,),
(N4.pos,),
(P1.pos, N0.pos),
(N0.pos, N1.pos),
(P2.pos, P1.pos, N0.pos),
(P1.pos, N0.pos, N1.pos),
(N0.pos, N1.pos, N2.pos)
)
# Templates over the `cluster` field: one per context slot P4..N4, plus the
# two adjacent pairs that include N0.
CLUSTERS = (
(P4.cluster,),
(P3.cluster,),
(P2.cluster,),
(P1.cluster,),
(N0.cluster,),
(N1.cluster,),
(N2.cluster,),
(N3.cluster,),
(N4.cluster,),
(P1.cluster, N0.cluster),
(N0.cluster, N1.cluster),
)
# Templates pairing a `cluster` field with a `pos` field of a neighbouring slot.
CLUSTER_POS = (
(P1.cluster, N0.pos),
# NOTE(review): same two fields as the previous template, in reverse order --
# possibly meant to be (P1.pos, N0.cluster); confirm before changing.
(N0.pos, P1.cluster),
(N0.cluster, N1.pos),
(N0.pos, N1.cluster)
)
# Templates over the seven `in_*` fields (males, females, surnames, places,
# games, celebs, names) for the slots N0, P1 and N1.
# NOTE(review): presumably these are gazetteer-membership flags -- confirm
# against the definition of the context fields.
GAZ = (
(N0.in_males,),
(N0.in_females,),
(N0.in_surnames,),
(N0.in_places,),
(N0.in_games,),
(N0.in_celebs,),
(N0.in_names,),
(P1.in_males,),
(P1.in_females,),
(P1.in_surnames,),
(P1.in_places,),
(P1.in_games,),
(P1.in_celebs,),
(P1.in_names,),
(N1.in_males,),
(N1.in_females,),
(N1.in_surnames,),
(N1.in_places,),
(N1.in_games,),
(N1.in_celebs,),
(N1.in_names,),
)
# The full template set: concatenation of every group defined above, in this
# order.
TEMPLATES = LOCAL + HISTORY + CLUSTERS + POS + CLUSTER_POS + GAZ + BOOLS

27
spacy/ner/_state.pxd Normal file
View File

@ -0,0 +1,27 @@
from cymem.cymem cimport Pool
from .moves cimport Move
# One entity span recorded by the parser state.
cdef struct Entity:
# Token index at which the entity was begun (begin_entity stores s.i here).
int start
# One past the token index at which the entity was ended (end_entity stores
# s.i + 1 here).
int end
# Entity type id; init_state initialises it to -1.
int label
# Parser state for one sentence.
cdef struct State:
# Array of `length` Entity slots; slots 0..j have been begun.
Entity* ents
# Array of `length` ints; transition() stores the class of the move applied
# at each token here.
int* tags
# Index of the token the next transition applies to.
int i
# Index into `ents` of the most recently begun entity; -1 before any entity.
int j
# Number of tokens in the sentence.
int length
# Function declarations shared with other modules; implementations live in
# _state.pyx. The `except` clauses give the C return value that signals a
# Python exception to the caller.
cdef int begin_entity(State* s, label) except -1
cdef int end_entity(State* s) except -1
cdef State* init_state(Pool mem, int sent_length) except NULL
cdef bint entity_is_open(State *s) except -1
cdef bint entity_is_sunk(State *s, Move* golds) except -1

40
spacy/ner/_state.pyx Normal file
View File

@ -0,0 +1,40 @@
from .moves cimport BEGIN, UNIT
cdef int begin_entity(State* s, label) except -1:
    """Start a new entity of type `label` at the current token.

    Moves `s.j` on to the next slot of `s.ents` and records the current
    token index `s.i` as the entity's start, together with its label.
    """
    cdef int slot = s.j + 1
    s.j = slot
    s.ents[slot].start = s.i
    s.ents[slot].label = label
cdef int end_entity(State* s) except -1:
    """Close the most recently begun entity at the current token.

    The stored end is `s.i + 1`, i.e. one past the current token.
    """
    cdef Entity* current = &s.ents[s.j]
    current.end = s.i + 1
cdef State* init_state(Pool mem, int sent_length) except NULL:
    """Allocate a fresh parser state for a sentence of `sent_length` tokens.

    All memory comes from `mem`. The entity and tag arrays each get one slot
    per token, every entity label starts at -1 and `j` starts at -1 (no
    entity yet).
    NOTE(review): `i` is never assigned here, so this presumably relies on
    the pool returning zeroed memory -- confirm.
    """
    cdef State* state = <State*>mem.alloc(1, sizeof(State))
    cdef int idx
    state.j = -1
    state.ents = <Entity*>mem.alloc(sent_length, sizeof(Entity))
    for idx in range(sent_length):
        state.ents[idx].label = -1
    state.tags = <int*>mem.alloc(sent_length, sizeof(int))
    state.length = sent_length
    return state
cdef bint entity_is_open(State *s) except -1:
    """Return True iff the most recently begun entity has not been ended yet.

    An entity counts as open when one has been begun (`s.j >= 0`), its label
    has been set (not -1), and `end_entity` has not yet stored an end for it.

    Checking the label alone is not enough: `end_entity` leaves the label in
    place, so a closed entity would otherwise be reported as open for the
    rest of the sentence. `end_entity` always stores `s.i + 1`, which is at
    least 1, so `end == 0` identifies an entity that is still open.
    NOTE(review): this assumes `end` is 0 in a freshly allocated Entity, i.e.
    that the pool used by init_state hands out zeroed memory -- confirm
    against the cymem documentation.
    """
    if s.j < 0:
        return False
    cdef Entity* ent = &s.ents[s.j]
    return ent.label != -1 and ent.end == 0
cdef bint entity_is_sunk(State *s, Move* golds) except -1:
    """Return True iff the open entity can no longer match the gold standard.

    With no open entity the answer is False. Otherwise the gold move at the
    entity's start token is inspected: the entity is sunk when that gold move
    is neither BEGIN nor UNIT, or when its label differs from the entity's.
    """
    if not entity_is_open(s):
        return False
    cdef Entity* ent = &s.ents[s.j]
    cdef Move* gold = &golds[ent.start]
    cdef bint gold_starts_here = gold.action == BEGIN or gold.action == UNIT
    return (not gold_starts_here) or gold.label != ent.label

View File

@ -0,0 +1,25 @@
from cymem.cymem cimport Pool
from thinc.features cimport Extractor
from thinc.learner cimport LinearModel
from thinc.typedefs cimport *
from ..tokens cimport Tokens
from ..typedefs cimport *
from .moves cimport Move
cdef class NERParser:
    # Declarations for the transition-based NER parser.
    #
    # `entity_types` and `n_classes` are declared here because the
    # implementation assigns them in __init__, and a cdef class can only hold
    # attributes that are declared.

    # Owns the long-lived buffers below.
    cdef Pool mem
    cdef Extractor extractor
    cdef LinearModel model

    # Entity type names as read from the model's config. Typed as `object`
    # so that whatever sequence the config provides is accepted.
    cdef readonly object entity_types
    # Number of moves (and therefore of model classes).
    cdef readonly int n_classes

    # Table of `n_classes` moves.
    cdef Move* _moves
    # Scratch buffers reused for every token.
    cdef atom_t* _context
    cdef feat_t* _feats
    cdef weight_t* _values
    cdef weight_t* _scores

    cpdef int train(self, Tokens tokens, golds)
    cpdef int set_tags(self, Tokens tokens) except -1

View File

@ -0,0 +1,81 @@
cimport cython
import random
import os
from os import path
import shutil
import json
from thinc.features cimport ConjFeat
from ..context cimport fill_context
from ..context cimport N_FIELDS
from .moves cimport Move
from .moves cimport fill_moves, transition, best_accepted
from .moves cimport set_accept_if_valid, set_accept_if_oracle
from .moves import get_n_moves
from ._state cimport State
from ._state cimport init_state
cdef class NERParser:
    """Greedy transition-based named-entity recogniser.

    For each token the parser fills a context, extracts features, scores
    every move with a linear model, and applies the best-scoring move among
    those currently accepted.
    """
    def __init__(self, model_dir):
        """Load the configuration (and weights, if present) from `model_dir`.

        `model_dir/config.json` must provide 'templates' and 'entity_types'.
        Weights are loaded from `model_dir/model` when that path exists.
        """
        self.mem = Pool()
        # Use a context manager so the config file handle is not leaked.
        with open(path.join(model_dir, 'config.json')) as file_:
            cfg = json.load(file_)
        templates = cfg['templates']
        self.entity_types = cfg['entity_types']
        self.extractor = Extractor(templates, [ConjFeat] * len(templates))
        self.n_classes = get_n_moves(len(self.entity_types))
        self._moves = <Move*>self.mem.alloc(self.n_classes, sizeof(Move))
        fill_moves(self._moves, len(self.entity_types))
        # One model class per move: `_scores` is indexed by move index in
        # best_accepted(). (The original read the undefined `self.tag_names`.)
        self.model = LinearModel(self.n_classes)
        if path.exists(path.join(model_dir, 'model')):
            self.model.load(path.join(model_dir, 'model'))
        self._context = <atom_t*>self.mem.alloc(N_FIELDS, sizeof(atom_t))
        self._feats = <feat_t*>self.mem.alloc(self.extractor.n+1, sizeof(feat_t))
        self._values = <weight_t*>self.mem.alloc(self.extractor.n+1, sizeof(weight_t))
        self._scores = <weight_t*>self.mem.alloc(self.model.nr_class, sizeof(weight_t))

    cpdef int train(self, Tokens tokens, gold_classes):
        """Run one training pass over `tokens` and update the model.

        `gold_classes` holds one gold class id per token. At every token the
        best valid move (the guess) is compared with the best oracle move
        (the gold); on a mismatch the weights are updated towards the gold
        and away from the guess. The guess is always the move applied.
        Returns 0.
        """
        cdef Pool mem = Pool()
        cdef State* s = init_state(mem, tokens.length)
        cdef Move* golds = <Move*>mem.alloc(len(gold_classes), sizeof(Move))
        cdef Move* guess
        cdef Move* gold
        for i, clas in enumerate(gold_classes):
            # NOTE(review): this treats gold class ids as 1-based offsets
            # into the move table -- confirm against how fill_moves assigns
            # `clas`. The original referred to the non-existent `self.moves`
            # and `Move.id`.
            golds[i] = self._moves[clas - 1]
            assert golds[i].clas == clas
        while s.i < tokens.length:
            fill_context(self._context, s.i, tokens)
            self.extractor.extract(self._feats, self._values, self._context, NULL)
            self.model.score(self._scores, self._feats, self._values)
            set_accept_if_valid(self._moves, self.n_classes, s)
            guess = best_accepted(self._moves, self._scores, self.n_classes)
            set_accept_if_oracle(self._moves, golds, self.n_classes, s) # TODO
            gold = best_accepted(self._moves, self._scores, self.n_classes)
            if guess.clas == gold.clas:
                # Correct guess: empty update, then carry on. The original
                # returned here, abandoning the rest of the sentence.
                self.model.update({})
            else:
                counts = {guess.clas: {}, gold.clas: {}}
                self.extractor.count(counts[gold.clas], self._feats, 1)
                self.extractor.count(counts[guess.clas], self._feats, -1)
                self.model.update(counts)
            transition(s, guess)
            tokens.ner[s.i-1] = s.tags[s.i-1]
        return 0

    cpdef int set_tags(self, Tokens tokens) except -1:
        """Predict a move for every token and write the result to `tokens.ner`."""
        cdef Pool mem = Pool()
        cdef State* s = init_state(mem, tokens.length)
        cdef Move* move
        while s.i < tokens.length:
            fill_context(self._context, s.i, tokens)
            self.extractor.extract(self._feats, self._values, self._context, NULL)
            self.model.score(self._scores, self._feats, self._values)
            set_accept_if_valid(self._moves, self.n_classes, s)
            move = best_accepted(self._moves, self._scores, self.n_classes)
            transition(s, move)
            # transition() has already advanced s.i past the tagged token.
            tokens.ner[s.i-1] = s.tags[s.i-1]

32
spacy/ner/moves.pxd Normal file
View File

@ -0,0 +1,32 @@
from cymem.cymem cimport Pool
from thinc.typedefs cimport class_t
from thinc.typedefs cimport weight_t
from ._state cimport State
# The five transition actions; the module maps them to the names
# 'B', 'I', 'L', 'U' and 'O' in ACTION_NAMES.
cpdef enum ActionType:
BEGIN
IN
LAST
UNIT
OUT
# Not an action: the number of actions, used to size ACTION_NAMES.
N_ACTIONS
# One transition: an action paired with an entity label.
cdef struct Move:
# Class id; transition() writes it into State.tags.
class_t clas
# An ActionType value.
int action
# Entity label the action applies to.
int label
# Scratch flag set by set_accept_if_valid / set_accept_if_oracle and read by
# best_accepted.
bint accept
# Function declarations shared with other modules; implementations live in
# moves.pyx. The two set_accept_* functions return the number of accepted
# moves and are declared `except 0`, so a count of zero is treated as an error
# return by callers.
cdef int set_accept_if_oracle(Move* moves, Move* golds, int n, State* s) except 0
cdef int set_accept_if_valid(Move* moves, int n, State* s) except 0
cdef Move* best_accepted(Move* moves, weight_t* scores, int n) except NULL
cdef int transition(State *s, Move* m) except -1
cdef int fill_moves(Move* moves, int n_tags) except -1

193
spacy/ner/moves.pyx Normal file
View File

@ -0,0 +1,193 @@
from ._state cimport begin_entity
from ._state cimport end_entity
from ._state cimport entity_is_open
from ._state cimport entity_is_sunk
# One-letter name of each action, indexed by its ActionType value.
ACTION_NAMES = [''] * N_ACTIONS
ACTION_NAMES[<int>OUT] = 'O'
ACTION_NAMES[<int>UNIT] = 'U'
ACTION_NAMES[<int>LAST] = 'L'
ACTION_NAMES[<int>IN] = 'I'
ACTION_NAMES[<int>BEGIN] = 'B'
cdef bint can_begin(State* s, int label):
    """BEGIN is valid only when no entity is currently open (`label` is unused)."""
    if entity_is_open(s):
        return False
    return True
cdef bint can_in(State* s, int label):
    """IN is valid only inside an open entity whose label equals `label`."""
    # The Entity struct stores its type in `label`; it has no `tag` field.
    return entity_is_open(s) and s.ents[s.j].label == label
cdef bint can_last(State* s, int label):
    """LAST is valid only inside an open entity whose label equals `label`."""
    # The Entity struct stores its type in `label`; it has no `tag` field.
    return entity_is_open(s) and s.ents[s.j].label == label
cdef bint can_unit(State* s, int label):
    """UNIT is valid only when no entity is currently open (`label` is unused)."""
    if entity_is_open(s):
        return False
    return True
cdef bint can_out(State* s, int label):
    """OUT is valid only when no entity is currently open (`label` is unused)."""
    if entity_is_open(s):
        return False
    return True
cdef bint is_oracle(ActionType act, int tag, ActionType g_act, int g_tag,
                    ActionType next_act, bint is_sunk):
    """Decide whether move (`act`, `tag`) is acceptable given the gold move.

    `g_act`/`g_tag` are the gold action and label for the current token,
    `next_act` is the gold action of the following token and `is_sunk` says
    whether the open entity is already unrecoverable.

    Decision table (rows: candidate action, columns: gold action):

             B           I        L                O          U
        B    tag match   False    False            False      False
        I    True        True     sunk & next O    next O     next O
        L    True        sunk     True             True       True
        O    False       True     True             True       False
        U    False       False    False            False      tag match

    Any action value outside these five yields False.
    """
    # BEGIN and UNIT are only right when the gold move is the same action
    # with the same label.
    if act == BEGIN or act == UNIT:
        return g_act == act and tag == g_tag

    cdef bint gold_known = (g_act == BEGIN or g_act == IN or g_act == LAST
                            or g_act == OUT or g_act == UNIT)
    if not gold_known:
        return False

    if act == IN:
        if g_act == BEGIN or g_act == IN:
            return True
        if g_act == LAST:
            return is_sunk and next_act == OUT
        # Gold is OUT or UNIT.
        return next_act == OUT
    if act == LAST:
        if g_act == IN:
            return is_sunk
        return True
    if act == OUT:
        return g_act == IN or g_act == LAST or g_act == OUT
    return False
cdef int set_accept_if_valid(Move* moves, int n_classes, State* s) except 0:
    """Flag every move that is structurally valid in state `s`.

    Sets `accept` on each of the `n_classes` moves using the matching
    can_* predicate, and returns how many moves ended up accepted. A move
    whose action is not one of the five known actions keeps whatever
    `accept` value it already had. A return value of 0 is an error return
    (`except 0`).
    """
    cdef int accepted = 0
    cdef int idx
    cdef int action
    cdef Move* move
    for idx in range(n_classes):
        move = &moves[idx]
        action = move.action
        if action == BEGIN:
            move.accept = can_begin(s, move.label)
        elif action == IN:
            move.accept = can_in(s, move.label)
        elif action == LAST:
            move.accept = can_last(s, move.label)
        elif action == UNIT:
            move.accept = can_unit(s, move.label)
        elif action == OUT:
            move.accept = can_out(s, move.label)
        accepted += move.accept
    return accepted
cdef int set_accept_if_oracle(Move* moves, Move* golds, int n_classes, State* s) except 0:
    """Flag every move that agrees with the gold-standard analysis.

    Sets `accept` on each of the `n_classes` moves according to is_oracle(),
    using the gold move for the current token, the gold action of the next
    token (OUT when there is no next token) and whether the open entity is
    already sunk. Returns the number of accepted moves; 0 is an error return
    (`except 0`).
    """
    cdef Move* g = &golds[s.i]
    # Look ahead only when a next token exists. The original guard
    # `s.i < s.length` read one element past the sentence on the last token.
    cdef ActionType next_act = <ActionType>golds[s.i + 1].action if s.i + 1 < s.length else OUT
    cdef bint is_sunk = entity_is_sunk(s, golds)
    cdef Move* m
    cdef int n_accept = 0
    cdef int i
    for i in range(n_classes):
        m = &moves[i]
        m.accept = is_oracle(<ActionType>m.action, m.label, <ActionType>g.action,
                             g.label, next_act, is_sunk)
        n_accept += m.accept
    return n_accept
cdef Move* best_accepted(Move* moves, weight_t* scores, int n) except NULL:
    """Return a pointer to the highest-scoring move whose `accept` flag is set.

    `scores[i]` is the score of `moves[i]`. On ties the move with the lowest
    index wins.

    Raises ValueError when none of the `n` moves is accepted. The original
    raised StandardError, which does not exist on Python 3; on Python 2
    ValueError is a subclass of it, so existing handlers still match.
    """
    cdef int best = -1
    cdef int i
    for i in range(n):
        # Strict `>` keeps the earliest accepted move on ties.
        if moves[i].accept and (best == -1 or scores[i] > scores[best]):
            best = i
    if best == -1:
        raise ValueError("No accepted move among %d candidates" % n)
    return &moves[best]
cdef int transition(State *s, Move* move) except -1:
    """Apply `move` to the state and advance to the next token.

    BEGIN opens an entity, LAST closes the open one, UNIT opens and
    immediately closes one; IN and OUT leave the entity list alone. The
    move's class is then stored in `s.tags` at the current token and `s.i`
    is incremented.
    """
    cdef int action = move.action
    if action == BEGIN or action == UNIT:
        begin_entity(s, move.label)
    if action == LAST or action == UNIT:
        end_entity(s)
    s.tags[s.i] = move.clas
    s.i += 1
def get_n_moves(n_tags):
    """Return the size of the move table for `n_tags` entity types.

    There is one BEGIN, IN, LAST and UNIT move per entity type, plus a
    single OUT move.
    """
    return 4 * n_tags + 1
cdef int fill_moves(Move* moves, int n_tags) except -1:
    """Populate the move table for `n_tags` entity types.

    Layout: `n_tags` BEGIN moves, then `n_tags` IN moves, `n_tags` LAST
    moves, `n_tags` UNIT moves, and finally one OUT move, for a total of
    `4 * n_tags + 1` entries (the size get_n_moves() reports). `moves` must
    have room for that many entries.

    Fixes over the original: the IN loop did not advance the index, so every
    IN move overwrote the same slot and the rest of the table was shifted;
    and the last line compared (`==`) instead of assigning, so the OUT move
    was never created.

    NOTE(review): `clas` is not assigned here, and the OUT move's `label` is
    left at whatever the caller's allocation holds -- confirm the intended
    values for both.
    """
    cdef int i = 0
    cdef int label
    for label in range(n_tags):
        moves[i].action = BEGIN
        moves[i].label = label
        i += 1
    for label in range(n_tags):
        moves[i].action = IN
        moves[i].label = label
        i += 1
    for label in range(n_tags):
        moves[i].action = LAST
        moves[i].label = label
        i += 1
    for label in range(n_tags):
        moves[i].action = UNIT
        moves[i].label = label
        i += 1
    moves[i].action = OUT

14
spacy/ner/pystate.pxd Normal file
View File

@ -0,0 +1,14 @@
from cymem.cymem cimport Pool
from .moves cimport Move
from ._state cimport State
# Declarations for PyState, a Python-visible class that holds a parser State
# together with its move table.
cdef class PyState:
# Owns the memory behind `_moves` and `_s`.
cdef Pool mem
# Entity type names passed to the constructor.
cdef readonly list entity_types
# Number of entries in `_moves`.
cdef readonly int n_classes
# Maps '<action name>-<tag name>' strings to indices into `_moves`.
cdef readonly dict moves_by_name
cdef Move* _moves
cdef State* _s

52
spacy/ner/pystate.pyx Normal file
View File

@ -0,0 +1,52 @@
from ._state cimport init_state
from ._state cimport entity_is_open
from .moves cimport fill_moves
from .moves cimport transition
from .moves import get_n_moves
from .moves import ACTION_NAMES
cdef class PyState:
    """Python-level handle on a parser State, driven by move names.

    Moves are addressed by strings of the form '<action>-<tag>', where
    <action> is an entry of ACTION_NAMES and <tag> one of `tag_names`.
    """
    def __init__(self, tag_names, n_tokens):
        """Build the move table for `tag_names` and a state for `n_tokens` tokens."""
        cdef Move* m
        cdef int i
        self.mem = Pool()
        self.entity_types = tag_names
        self.n_classes = get_n_moves(len(self.entity_types))
        assert self.n_classes != 0
        self._moves = <Move*>self.mem.alloc(self.n_classes, sizeof(Move))
        fill_moves(self._moves, len(self.entity_types))
        self._s = init_state(self.mem, n_tokens)
        self.moves_by_name = {}
        for i in range(self.n_classes):
            m = &self._moves[i]
            action_name = ACTION_NAMES[m.action]
            tag_name = tag_names[m.label]
            self.moves_by_name['%s-%s' % (action_name, tag_name)] = i

    def transition(self, unicode move_name):
        """Apply the move called `move_name`; raises KeyError for unknown names."""
        cdef int m_i = self.moves_by_name[move_name]
        cdef Move* m = &self._moves[m_i]
        transition(self._s, m)

    def is_valid(self, unicode move_name):
        """Not implemented yet; always returns None."""
        pass

    def is_gold(self, unicode move_name):
        """Not implemented yet; always returns None."""
        pass

    property ent:
        """The most recently begun entity, or None if none has been begun."""
        def __get__(self):
            # `j` is -1 until the first entity is begun; indexing the array
            # with it would read memory before the start of `ents`.
            if self._s.j < 0:
                return None
            return self._s.ents[self._s.j]

    property n_ents:
        """Number of entities begun so far."""
        def __get__(self):
            return self._s.j + 1

    property i:
        """Index of the token the next transition applies to."""
        def __get__(self):
            return self._s.i

    property open_entity:
        """Whether an entity is currently open, as reported by entity_is_open()."""
        def __get__(self):
            return entity_is_open(self._s)