mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 09:14:32 +03:00
* Refactoring working for parser, but now need to rig up features for NER, and then debug oracle etc.
This commit is contained in:
parent
4539c70542
commit
ae235e07b9
|
@ -206,7 +206,7 @@ def train(Language, train_loc, model_dir, n_iter=15, feat_set=u'basic', seed=0,
|
|||
|
||||
Config.write(dep_model_dir, 'config', features=feat_set, seed=seed,
|
||||
labels=Language.ParserTransitionSystem.get_labels(gold_tuples))
|
||||
Config.write(ner_model_dir, 'config', features=feat_set, seed=seed,
|
||||
Config.write(ner_model_dir, 'config', features='ner', seed=seed,
|
||||
labels=Language.EntityTransitionSystem.get_labels(gold_tuples))
|
||||
|
||||
nlp = Language()
|
||||
|
@ -214,6 +214,7 @@ def train(Language, train_loc, model_dir, n_iter=15, feat_set=u'basic', seed=0,
|
|||
for itn in range(n_iter):
|
||||
dep_corr = 0
|
||||
pos_corr = 0
|
||||
ent_corr = 0
|
||||
n_tokens = 0
|
||||
for raw_text, segmented_text, annot_tuples in gold_tuples:
|
||||
if gold_preproc:
|
||||
|
@ -221,14 +222,11 @@ def train(Language, train_loc, model_dir, n_iter=15, feat_set=u'basic', seed=0,
|
|||
else:
|
||||
sents = [nlp.tokenizer(raw_text)]
|
||||
for tokens in sents:
|
||||
|
||||
gold = GoldParse(tokens, annot_tuples, nlp.tags,
|
||||
nlp.parser.moves.label_ids,
|
||||
nlp.entity.moves.label_ids)
|
||||
|
||||
gold = GoldParse(tokens, annot_tuples)
|
||||
nlp.tagger(tokens)
|
||||
#ent_corr += nlp.entity.train(tokens, gold, force_gold=force_gold)
|
||||
dep_corr += nlp.parser.train(tokens, gold, force_gold=force_gold)
|
||||
pos_corr += nlp.tagger.train(tokens, gold.tags_)
|
||||
pos_corr += nlp.tagger.train(tokens, gold.tags)
|
||||
n_tokens += len(tokens)
|
||||
acc = float(dep_corr) / n_tokens
|
||||
pos_acc = float(pos_corr) / n_tokens
|
||||
|
|
|
@ -27,6 +27,13 @@ cdef enum:
|
|||
BREAK
|
||||
N_MOVES
|
||||
|
||||
# Single-letter name of each transition move, indexed by the move's enum value.
MOVE_NAMES = [None] * N_MOVES
(MOVE_NAMES[SHIFT],
 MOVE_NAMES[REDUCE],
 MOVE_NAMES[LEFT],
 MOVE_NAMES[RIGHT],
 MOVE_NAMES[BREAK]) = ('S', 'D', 'L', 'R', 'B')
|
||||
|
||||
|
||||
cdef do_func_t[N_MOVES] do_funcs
|
||||
cdef get_cost_func_t[N_MOVES] get_cost_funcs
|
||||
|
@ -46,6 +53,23 @@ cdef class ArcEager(TransitionSystem):
|
|||
move_labels[LEFT][label] = True
|
||||
return move_labels
|
||||
|
||||
cdef int preprocess_gold(self, GoldParse gold) except -1:
    """Fill the C-level head and label arrays of `gold` for the parser.

    For each of the `gold.length` tokens, `gold.heads[i]` is copied into
    `gold.c_heads[i]`, and `gold.labels[i]` is looked up in
    `self.label_ids` and the result stored in `gold.c_labels[i]`.
    """
    cdef int idx = 0
    while idx < gold.length:
        gold.c_heads[idx] = gold.heads[idx]
        gold.c_labels[idx] = self.label_ids[gold.labels[idx]]
        idx += 1
    return 0
|
||||
|
||||
|
||||
cdef Transition lookup_transition(self, object name) except *:
    """Return the transition in `self.c` that `name` refers to.

    `name` is either a bare move name from MOVE_NAMES (e.g. 'S') or a
    '<move>-<label>' string. Only the first '-' separates the move from the
    label, and the label is resolved through `self.label_ids`. A bare move
    name is looked up with label 0.

    Raises ValueError (from `list.index`) if the move name is not in
    MOVE_NAMES, and KeyError if no transition has that move and label.
    """
    if '-' in name:
        move_str, label_str = name.split('-', 1)
        label = self.label_ids[label_str]
    else:
        # A bare name is the move itself. move_str must be bound on this
        # path too, otherwise the index() lookup below hits an unbound local.
        move_str = name
        label = 0
    move = MOVE_NAMES.index(move_str)
    for i in range(self.n_moves):
        if self.c[i].move == move and self.c[i].label == label:
            return self.c[i]
    # Fail loudly rather than fall off the end without a return value.
    raise KeyError(name)
|
||||
|
||||
cdef Transition init_transition(self, int clas, int move, int label) except *:
|
||||
# TODO: Apparent Cython bug here when we try to use the Transition()
|
||||
# constructor with the function pointers
|
||||
|
|
|
@ -3,31 +3,21 @@ from cymem.cymem cimport Pool
|
|||
from ..structs cimport TokenC
|
||||
from .transition_system cimport Transition
|
||||
|
||||
cimport numpy
|
||||
|
||||
cdef class GoldParse:
|
||||
cdef Pool mem
|
||||
|
||||
cdef int length
|
||||
cdef readonly int loss
|
||||
cdef readonly object ids
|
||||
cdef readonly object tags
|
||||
cdef readonly object heads
|
||||
cdef readonly object labels
|
||||
cdef readonly list tags
|
||||
cdef readonly list heads
|
||||
cdef readonly list labels
|
||||
cdef readonly list ner
|
||||
|
||||
cdef readonly object tags_
|
||||
cdef readonly object labels_
|
||||
cdef readonly object ner_
|
||||
|
||||
cdef Transition* ner
|
||||
cdef int* c_tags
|
||||
cdef int* c_heads
|
||||
cdef int* c_labels
|
||||
cdef Transition* c_ner
|
||||
|
||||
cdef int heads_correct(self, TokenC* tokens, bint score_punct=?) except -1
|
||||
|
||||
|
||||
cdef class NERAnnotation:
    # Memory pool that the three int arrays below are allocated from.
    # starts / ends / labels are per-token arrays giving the span start,
    # span end and label of the entity covering each token.
    cdef:
        Pool mem
        int* starts
        int* ends
        int* labels
    # The entity list passed to the constructor, readable from Python.
    cdef readonly list entities
|
||||
|
|
|
@ -34,38 +34,37 @@ def read_docparse_file(loc):
|
|||
sents.append((raw_text, tokenized, (ids, tags, heads, labels, iob_ents)))
|
||||
return sents
|
||||
|
||||
def _parse_line(line):
|
||||
pieces = line.split()
|
||||
if len(pieces) == 4:
|
||||
return 0, pieces[0], pieces[1], int(pieces[2]) - 1, pieces[3]
|
||||
else:
|
||||
id_ = int(pieces[0])
|
||||
word = pieces[1]
|
||||
pos = pieces[3]
|
||||
iob_ent = pieces[5]
|
||||
head_idx = int(pieces[6])
|
||||
label = pieces[7]
|
||||
return id_, word, pos, head_idx, label, iob_ent
|
||||
|
||||
cdef class GoldParse:
|
||||
def __init__(self, tokens, annot_tuples, pos_tags, dep_labels, entity_types):
|
||||
def __init__(self, tokens, annot_tuples):
|
||||
self.mem = Pool()
|
||||
self.loss = 0
|
||||
self.length = len(tokens)
|
||||
self.ids = numpy.empty(shape=(len(tokens), 1), dtype=numpy.int32)
|
||||
self.tags = numpy.empty(shape=(len(tokens), 1), dtype=numpy.int32)
|
||||
self.heads = numpy.empty(shape=(len(tokens), 1), dtype=numpy.int32)
|
||||
self.labels = numpy.empty(shape=(len(tokens), 1), dtype=numpy.int32)
|
||||
|
||||
self.ids[:] = -1
|
||||
self.tags[:] = -1
|
||||
self.heads[:] = -1
|
||||
self.labels[:] = -1
|
||||
|
||||
self.ner = <Transition*>self.mem.alloc(len(tokens), sizeof(Transition))
|
||||
# These are filled by the tagger/parser/entity recogniser
|
||||
self.c_tags = <int*>self.mem.alloc(len(tokens), sizeof(int))
|
||||
self.c_heads = <int*>self.mem.alloc(len(tokens), sizeof(int))
|
||||
self.c_labels = <int*>self.mem.alloc(len(tokens), sizeof(int))
|
||||
self.c_ner = <Transition*>self.mem.alloc(len(tokens), sizeof(Transition))
|
||||
|
||||
for i in range(len(tokens)):
|
||||
self.c_heads[i] = -1
|
||||
self.c_labels[i] = -1
|
||||
|
||||
self.tags_ = [None] * len(tokens)
|
||||
self.labels_ = [None] * len(tokens)
|
||||
self.ner_ = [None] * len(tokens)
|
||||
self.tags = [None] * len(tokens)
|
||||
self.heads = [-1] * len(tokens)
|
||||
self.labels = ['MISSING'] * len(tokens)
|
||||
self.ner = [None] * len(tokens)
|
||||
|
||||
idx_map = {token.idx: token.i for token in tokens}
|
||||
print idx_map
|
||||
# TODO: Fill NER moves
|
||||
print raw_text
|
||||
for idx, tag, head, label, ner in zip(*annot_tuples):
|
||||
if idx < tokens[0].idx:
|
||||
pass
|
||||
|
@ -73,16 +72,12 @@ cdef class GoldParse:
|
|||
break
|
||||
elif idx in idx_map:
|
||||
i = idx_map[idx]
|
||||
print i, idx, head, idx_map.get(head, -1)
|
||||
self.ids[i] = idx
|
||||
self.tags[i] = pos_tags.index(tag)
|
||||
self.tags[i] = tag
|
||||
self.heads[i] = idx_map.get(head, -1)
|
||||
self.labels[i] = dep_labels[label]
|
||||
self.c_heads[i] = -1
|
||||
self.c_labels[i] = -1
|
||||
self.tags_[i] = tag
|
||||
self.labels_[i] = label
|
||||
self.ner_[i] = ner
|
||||
self.labels[i] = label
|
||||
self.tags[i] = tag
|
||||
self.labels[i] = label
|
||||
self.ner[i] = ner
|
||||
|
||||
@property
|
||||
def n_non_punct(self):
|
||||
|
@ -116,71 +111,3 @@ def _map_indices_to_tokens(ids, heads):
|
|||
return mapped
|
||||
|
||||
|
||||
def _parse_line(line):
|
||||
pieces = line.split()
|
||||
if len(pieces) == 4:
|
||||
return 0, pieces[0], pieces[1], int(pieces[2]) - 1, pieces[3]
|
||||
else:
|
||||
id_ = int(pieces[0])
|
||||
word = pieces[1]
|
||||
pos = pieces[3]
|
||||
iob_ent = pieces[5]
|
||||
head_idx = int(pieces[6])
|
||||
label = pieces[7]
|
||||
return id_, word, pos, head_idx, label, iob_ent
|
||||
|
||||
|
||||
cdef class NERAnnotation:
    """Entity spans of one sequence, stored per token in C int arrays.

    For every token inside an entity span, `starts`, `ends` and `labels`
    hold that span's start index, exclusive end index and integer label.
    Tokens outside every span hold -1 in all three arrays.
    """
    def __init__(self, entities, length, entity_types):
        """Build the per-token arrays from a list of entity spans.

        entities: iterable of (start, end, label) integer triples; `end` is
            exclusive. Kept as-is on `self.entities`.
        length: number of tokens; the size of each array.
        entity_types: not used by this constructor.

        Raises ValueError if a non-empty span does not fit in [0, length).
        """
        cdef int start, end, label
        self.mem = Pool()
        self.starts = <int*>self.mem.alloc(length, sizeof(int))
        self.ends = <int*>self.mem.alloc(length, sizeof(int))
        self.labels = <int*>self.mem.alloc(length, sizeof(int))
        self.entities = entities
        # Setting every byte to 0xFF makes each int read back as -1.
        memset(self.starts, -1, sizeof(int) * length)
        memset(self.ends, -1, sizeof(int) * length)
        memset(self.labels, -1, sizeof(int) * length)

        for start, end, label in entities:
            # The arrays are raw C memory: a span outside [0, length) would
            # silently write out of bounds, so reject it up front.
            if start < end and (start < 0 or end > length):
                raise ValueError(
                    "Entity span (%d, %d) does not fit in %d tokens"
                    % (start, end, length))
            for i in range(start, end):
                self.starts[i] = start
                self.ends[i] = end
                self.labels[i] = label

    @property
    def biluo_tags(self):
        """Placeholder: not implemented yet, always evaluates to None."""
        pass

    @property
    def iob_tags(self):
        """Placeholder: not implemented yet, always evaluates to None."""
        pass

    @classmethod
    def from_iobs(cls, iob_strs, entity_types):
        """Build an annotation from IOB tags by converting them to BILUO."""
        return cls.from_biluos(iob_to_biluo(iob_strs), entity_types)

    @classmethod
    def from_biluos(cls, tag_strs, entity_types):
        """Build an annotation from a sequence of BILUO tag strings.

        tag_strs: one tag per token. 'O' and '-' mark tokens outside any
            entity; every other tag is '<move>-<label>'. 'U' yields a
            one-token span, 'B' opens a span and 'L' closes it. Other moves
            add no span.
        entity_types: list of label strings; a label's integer ID is its
            index. Labels not yet in the list are appended, so the caller's
            list is modified in place.
        """
        entities = []
        start = None
        for i, tag_str in enumerate(tag_strs):
            if tag_str == 'O' or tag_str == '-':
                continue
            # Split on the first '-' only, so labels may contain '-'.
            move, label_str = tag_str.split('-', 1)
            # list.index raises ValueError for a missing item (it never
            # returns -1), so register unseen labels before looking them up.
            if label_str not in entity_types:
                entity_types.append(label_str)
            label = entity_types.index(label_str)
            if move == 'U':
                assert start is None
                entities.append((i, i+1, label))
            elif move == 'B':
                assert start is None
                start = i
            elif move == 'L':
                assert start is not None
                entities.append((start, i+1, label))
                start = None
        return cls(entities, len(tag_strs), entity_types)
|
||||
|
||||
|
||||
|
|
|
@ -21,6 +21,14 @@ cdef enum:
|
|||
OUT
|
||||
N_MOVES
|
||||
|
||||
# Single-letter name of each entity move, indexed by the move's enum value.
MOVE_NAMES = [None] * N_MOVES
(MOVE_NAMES[MISSING],
 MOVE_NAMES[BEGIN],
 MOVE_NAMES[IN],
 MOVE_NAMES[LAST],
 MOVE_NAMES[UNIT],
 MOVE_NAMES[OUT]) = ('M', 'B', 'I', 'L', 'U', 'O')
|
||||
|
||||
|
||||
cdef do_func_t[N_MOVES] do_funcs
|
||||
|
||||
|
@ -70,6 +78,23 @@ cdef class BiluoPushDown(TransitionSystem):
|
|||
move_labels[moves.index(move_str)][label] = True
|
||||
return move_labels
|
||||
|
||||
cdef int preprocess_gold(self, GoldParse gold) except -1:
    """Fill `gold.c_ner` with the gold-standard entity transitions.

    The tags in `gold.ner` are passed through `iob_to_biluo`, and each of
    the first `gold.length` resulting strings is resolved to a Transition
    with `lookup_transition`.
    """
    cdef int idx
    tag_names = iob_to_biluo(gold.ner)
    for idx in range(gold.length):
        gold.c_ner[idx] = self.lookup_transition(tag_names[idx])
    return 0
|
||||
|
||||
cdef Transition lookup_transition(self, object name) except *:
    """Return the transition in `self.c` that `name` refers to.

    `name` is either a bare move name from MOVE_NAMES (e.g. 'O') or a
    '<move>-<label>' string. Only the first '-' separates the move from the
    label, and the label is resolved through `self.label_ids`. A bare move
    name is looked up with label 0.

    Raises ValueError (from `list.index`) if the move name is not in
    MOVE_NAMES, and KeyError if no transition has that move and label.
    """
    if '-' in name:
        move_str, label_str = name.split('-', 1)
        label = self.label_ids[label_str]
    else:
        move_str = name
        label = 0
    move = MOVE_NAMES.index(move_str)
    for i in range(self.n_moves):
        if self.c[i].move == move and self.c[i].label == label:
            return self.c[i]
    # Fail loudly rather than fall off the end without a return value.
    raise KeyError(name)
|
||||
|
||||
cdef Transition init_transition(self, int clas, int move, int label) except *:
|
||||
# TODO: Apparent Cython bug here when we try to use the Transition()
|
||||
# constructor with the function pointers
|
||||
|
@ -101,9 +126,9 @@ cdef class BiluoPushDown(TransitionSystem):
|
|||
cdef int _get_cost(const Transition* self, const State* s, GoldParse gold) except -1:
    """Return the cost of applying this transition in state `s`.

    Returns 9000 when `_is_valid` rejects the move in this state. Otherwise
    returns 0 if `_is_gold` accepts the move against the gold transition for
    the current token, and 1 if it does not.
    """
    if not _is_valid(self.move, self.label, s):
        return 9000
    # The gold transitions are read from gold.c_ner, the array that
    # preprocess_gold fills.
    cdef bint is_sunk = _entity_is_sunk(s, gold.c_ner)
    # Only look ahead when a next token exists; on the last token the next
    # action is OUT.
    # NOTE(review): assumes s.sent_len is the token count, matching the
    # one-entry-per-token allocation of c_ner -- confirm against State.
    cdef int next_act = gold.c_ner[s.i+1].move if s.i + 1 < s.sent_len else OUT
    return not _is_gold(self.move, self.label, gold.c_ner[s.i].move, gold.c_ner[s.i].label,
                        next_act, is_sunk)
|
||||
|
||||
cdef bint _is_gold(int act, int tag, int g_act, int g_tag,
|
||||
|
|
|
@ -34,6 +34,8 @@ from .conll cimport GoldParse
|
|||
from . import _parse_features
|
||||
from ._parse_features cimport fill_context, CONTEXT_SIZE
|
||||
|
||||
from ._ner_features cimport _ner_features
|
||||
|
||||
|
||||
DEBUG = False
|
||||
def set_debug(val):
|
||||
|
@ -55,6 +57,8 @@ def get_templates(name):
|
|||
pf = _parse_features
|
||||
if name == 'zhang':
|
||||
return pf.arc_eager
|
||||
elif name == 'ner':
|
||||
return _ner_features.basic
|
||||
else:
|
||||
return (pf.unigrams + pf.s0_n0 + pf.s1_n0 + pf.s0_n1 + pf.n0_n1 + \
|
||||
pf.tree_shape + pf.trigrams)
|
||||
|
@ -95,7 +99,8 @@ cdef class GreedyParser:
|
|||
Transition best
|
||||
|
||||
atom_t[CONTEXT_SIZE] context
|
||||
|
||||
|
||||
self.moves.preprocess_gold(gold)
|
||||
cdef Pool mem = Pool()
|
||||
cdef State* state = init_state(mem, tokens.data, tokens.length)
|
||||
while not is_final(state):
|
||||
|
|
|
@ -29,6 +29,10 @@ cdef class TransitionSystem:
|
|||
cdef const Transition* c
|
||||
cdef readonly int n_moves
|
||||
|
||||
cdef int preprocess_gold(self, GoldParse gold) except -1
|
||||
|
||||
cdef Transition lookup_transition(self, object name) except *
|
||||
|
||||
cdef Transition init_transition(self, int clas, int move, int label) except *
|
||||
|
||||
cdef Transition best_valid(self, const weight_t* scores, const State* state) except *
|
||||
|
|
|
@ -28,6 +28,12 @@ cdef class TransitionSystem:
|
|||
self.label_ids['MISSING'] = -1
|
||||
self.c = moves
|
||||
|
||||
cdef int preprocess_gold(self, GoldParse gold) except -1:
    """Not implemented on the base class; always raises NotImplementedError."""
    raise NotImplementedError()
|
||||
|
||||
cdef Transition lookup_transition(self, object name) except *:
    """Not implemented on the base class; always raises NotImplementedError."""
    raise NotImplementedError()
|
||||
|
||||
cdef Transition init_transition(self, int clas, int move, int label) except *:
    """Not implemented on the base class; always raises NotImplementedError."""
    raise NotImplementedError()
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user