mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 10:16:27 +03:00
* Refactoring working for parser, but now need to rig up features for NER, and then debug oracle etc.
This commit is contained in:
parent
4539c70542
commit
ae235e07b9
|
@ -206,7 +206,7 @@ def train(Language, train_loc, model_dir, n_iter=15, feat_set=u'basic', seed=0,
|
||||||
|
|
||||||
Config.write(dep_model_dir, 'config', features=feat_set, seed=seed,
|
Config.write(dep_model_dir, 'config', features=feat_set, seed=seed,
|
||||||
labels=Language.ParserTransitionSystem.get_labels(gold_tuples))
|
labels=Language.ParserTransitionSystem.get_labels(gold_tuples))
|
||||||
Config.write(ner_model_dir, 'config', features=feat_set, seed=seed,
|
Config.write(ner_model_dir, 'config', features='ner', seed=seed,
|
||||||
labels=Language.EntityTransitionSystem.get_labels(gold_tuples))
|
labels=Language.EntityTransitionSystem.get_labels(gold_tuples))
|
||||||
|
|
||||||
nlp = Language()
|
nlp = Language()
|
||||||
|
@ -214,6 +214,7 @@ def train(Language, train_loc, model_dir, n_iter=15, feat_set=u'basic', seed=0,
|
||||||
for itn in range(n_iter):
|
for itn in range(n_iter):
|
||||||
dep_corr = 0
|
dep_corr = 0
|
||||||
pos_corr = 0
|
pos_corr = 0
|
||||||
|
ent_corr = 0
|
||||||
n_tokens = 0
|
n_tokens = 0
|
||||||
for raw_text, segmented_text, annot_tuples in gold_tuples:
|
for raw_text, segmented_text, annot_tuples in gold_tuples:
|
||||||
if gold_preproc:
|
if gold_preproc:
|
||||||
|
@ -221,14 +222,11 @@ def train(Language, train_loc, model_dir, n_iter=15, feat_set=u'basic', seed=0,
|
||||||
else:
|
else:
|
||||||
sents = [nlp.tokenizer(raw_text)]
|
sents = [nlp.tokenizer(raw_text)]
|
||||||
for tokens in sents:
|
for tokens in sents:
|
||||||
|
gold = GoldParse(tokens, annot_tuples)
|
||||||
gold = GoldParse(tokens, annot_tuples, nlp.tags,
|
|
||||||
nlp.parser.moves.label_ids,
|
|
||||||
nlp.entity.moves.label_ids)
|
|
||||||
|
|
||||||
nlp.tagger(tokens)
|
nlp.tagger(tokens)
|
||||||
|
#ent_corr += nlp.entity.train(tokens, gold, force_gold=force_gold)
|
||||||
dep_corr += nlp.parser.train(tokens, gold, force_gold=force_gold)
|
dep_corr += nlp.parser.train(tokens, gold, force_gold=force_gold)
|
||||||
pos_corr += nlp.tagger.train(tokens, gold.tags_)
|
pos_corr += nlp.tagger.train(tokens, gold.tags)
|
||||||
n_tokens += len(tokens)
|
n_tokens += len(tokens)
|
||||||
acc = float(dep_corr) / n_tokens
|
acc = float(dep_corr) / n_tokens
|
||||||
pos_acc = float(pos_corr) / n_tokens
|
pos_acc = float(pos_corr) / n_tokens
|
||||||
|
|
|
@ -27,6 +27,13 @@ cdef enum:
|
||||||
BREAK
|
BREAK
|
||||||
N_MOVES
|
N_MOVES
|
||||||
|
|
||||||
|
# String code for each move, stored at the index given by the move constant.
# lookup_transition() converts a code back to its move with MOVE_NAMES.index().
MOVE_NAMES = [None] * N_MOVES
MOVE_NAMES[SHIFT] = 'S'
MOVE_NAMES[REDUCE] = 'D'
MOVE_NAMES[LEFT] = 'L'
MOVE_NAMES[RIGHT] = 'R'
MOVE_NAMES[BREAK] = 'B'
|
||||||
|
|
||||||
|
|
||||||
cdef do_func_t[N_MOVES] do_funcs
|
cdef do_func_t[N_MOVES] do_funcs
|
||||||
cdef get_cost_func_t[N_MOVES] get_cost_funcs
|
cdef get_cost_func_t[N_MOVES] get_cost_funcs
|
||||||
|
@ -46,6 +53,23 @@ cdef class ArcEager(TransitionSystem):
|
||||||
move_labels[LEFT][label] = True
|
move_labels[LEFT][label] = True
|
||||||
return move_labels
|
return move_labels
|
||||||
|
|
||||||
|
cdef int preprocess_gold(self, GoldParse gold) except -1:
    # Mirror the Python-level gold annotations into the C arrays used during
    # parsing: head indices are copied unchanged, and each dependency label
    # string is translated to its integer ID through self.label_ids.
    for token_i in range(gold.length):
        gold.c_heads[token_i] = gold.heads[token_i]
        label_name = gold.labels[token_i]
        gold.c_labels[token_i] = self.label_ids[label_name]
|
||||||
|
|
||||||
|
|
||||||
|
cdef Transition lookup_transition(self, object name) except *:
    # Resolve a transition name to the matching entry of self.c.
    #
    # `name` is either a bare move code from MOVE_NAMES (e.g. 'S') or
    # '<move>-<label>'. Only the first '-' separates the move from the label,
    # so a label may itself contain hyphens. A bare move is matched against
    # label 0.
    #
    # Raises ValueError (from list.index) if the move code is not in
    # MOVE_NAMES, propagates whatever self.label_ids[...] raises for an
    # unknown label, and raises KeyError(name) if no transition in self.c has
    # the requested move/label pair.
    if '-' in name:
        move_str, label_str = name.split('-', 1)
        label = self.label_ids[label_str]
    else:
        # Previously move_str was left unbound on this branch, so every
        # unlabelled move name failed at the MOVE_NAMES.index() call below.
        move_str = name
        label = 0
    move = MOVE_NAMES.index(move_str)
    for i in range(self.n_moves):
        if self.c[i].move == move and self.c[i].label == label:
            return self.c[i]
    # Falling off the end would hand the caller an unset Transition struct;
    # fail loudly instead.
    raise KeyError(name)
|
||||||
|
|
||||||
cdef Transition init_transition(self, int clas, int move, int label) except *:
|
cdef Transition init_transition(self, int clas, int move, int label) except *:
|
||||||
# TODO: Apparent Cython bug here when we try to use the Transition()
|
# TODO: Apparent Cython bug here when we try to use the Transition()
|
||||||
# constructor with the function pointers
|
# constructor with the function pointers
|
||||||
|
|
|
@ -3,31 +3,21 @@ from cymem.cymem cimport Pool
|
||||||
from ..structs cimport TokenC
|
from ..structs cimport TokenC
|
||||||
from .transition_system cimport Transition
|
from .transition_system cimport Transition
|
||||||
|
|
||||||
|
cimport numpy
|
||||||
|
|
||||||
cdef class GoldParse:
|
cdef class GoldParse:
|
||||||
cdef Pool mem
|
cdef Pool mem
|
||||||
|
|
||||||
cdef int length
|
cdef int length
|
||||||
cdef readonly int loss
|
cdef readonly int loss
|
||||||
cdef readonly object ids
|
cdef readonly list tags
|
||||||
cdef readonly object tags
|
cdef readonly list heads
|
||||||
cdef readonly object heads
|
cdef readonly list labels
|
||||||
cdef readonly object labels
|
cdef readonly list ner
|
||||||
|
|
||||||
cdef readonly object tags_
|
cdef int* c_tags
|
||||||
cdef readonly object labels_
|
|
||||||
cdef readonly object ner_
|
|
||||||
|
|
||||||
cdef Transition* ner
|
|
||||||
cdef int* c_heads
|
cdef int* c_heads
|
||||||
cdef int* c_labels
|
cdef int* c_labels
|
||||||
|
cdef Transition* c_ner
|
||||||
|
|
||||||
cdef int heads_correct(self, TokenC* tokens, bint score_punct=?) except -1
|
cdef int heads_correct(self, TokenC* tokens, bint score_punct=?) except -1
|
||||||
|
|
||||||
|
|
||||||
cdef class NERAnnotation:
|
|
||||||
cdef Pool mem
|
|
||||||
cdef int* starts
|
|
||||||
cdef int* ends
|
|
||||||
cdef int* labels
|
|
||||||
cdef readonly list entities
|
|
||||||
|
|
|
@ -34,38 +34,37 @@ def read_docparse_file(loc):
|
||||||
sents.append((raw_text, tokenized, (ids, tags, heads, labels, iob_ents)))
|
sents.append((raw_text, tokenized, (ids, tags, heads, labels, iob_ents)))
|
||||||
return sents
|
return sents
|
||||||
|
|
||||||
|
def _parse_line(line):
|
||||||
|
pieces = line.split()
|
||||||
|
if len(pieces) == 4:
|
||||||
|
return 0, pieces[0], pieces[1], int(pieces[2]) - 1, pieces[3]
|
||||||
|
else:
|
||||||
|
id_ = int(pieces[0])
|
||||||
|
word = pieces[1]
|
||||||
|
pos = pieces[3]
|
||||||
|
iob_ent = pieces[5]
|
||||||
|
head_idx = int(pieces[6])
|
||||||
|
label = pieces[7]
|
||||||
|
return id_, word, pos, head_idx, label, iob_ent
|
||||||
|
|
||||||
cdef class GoldParse:
|
cdef class GoldParse:
|
||||||
def __init__(self, tokens, annot_tuples, pos_tags, dep_labels, entity_types):
|
def __init__(self, tokens, annot_tuples):
|
||||||
self.mem = Pool()
|
self.mem = Pool()
|
||||||
self.loss = 0
|
self.loss = 0
|
||||||
self.length = len(tokens)
|
self.length = len(tokens)
|
||||||
self.ids = numpy.empty(shape=(len(tokens), 1), dtype=numpy.int32)
|
|
||||||
self.tags = numpy.empty(shape=(len(tokens), 1), dtype=numpy.int32)
|
|
||||||
self.heads = numpy.empty(shape=(len(tokens), 1), dtype=numpy.int32)
|
|
||||||
self.labels = numpy.empty(shape=(len(tokens), 1), dtype=numpy.int32)
|
|
||||||
|
|
||||||
self.ids[:] = -1
|
# These are filled by the tagger/parser/entity recogniser
|
||||||
self.tags[:] = -1
|
self.c_tags = <int*>self.mem.alloc(len(tokens), sizeof(int))
|
||||||
self.heads[:] = -1
|
|
||||||
self.labels[:] = -1
|
|
||||||
|
|
||||||
self.ner = <Transition*>self.mem.alloc(len(tokens), sizeof(Transition))
|
|
||||||
self.c_heads = <int*>self.mem.alloc(len(tokens), sizeof(int))
|
self.c_heads = <int*>self.mem.alloc(len(tokens), sizeof(int))
|
||||||
self.c_labels = <int*>self.mem.alloc(len(tokens), sizeof(int))
|
self.c_labels = <int*>self.mem.alloc(len(tokens), sizeof(int))
|
||||||
|
self.c_ner = <Transition*>self.mem.alloc(len(tokens), sizeof(Transition))
|
||||||
|
|
||||||
for i in range(len(tokens)):
|
self.tags = [None] * len(tokens)
|
||||||
self.c_heads[i] = -1
|
self.heads = [-1] * len(tokens)
|
||||||
self.c_labels[i] = -1
|
self.labels = ['MISSING'] * len(tokens)
|
||||||
|
self.ner = [None] * len(tokens)
|
||||||
self.tags_ = [None] * len(tokens)
|
|
||||||
self.labels_ = [None] * len(tokens)
|
|
||||||
self.ner_ = [None] * len(tokens)
|
|
||||||
|
|
||||||
idx_map = {token.idx: token.i for token in tokens}
|
idx_map = {token.idx: token.i for token in tokens}
|
||||||
print idx_map
|
|
||||||
# TODO: Fill NER moves
|
|
||||||
print raw_text
|
|
||||||
for idx, tag, head, label, ner in zip(*annot_tuples):
|
for idx, tag, head, label, ner in zip(*annot_tuples):
|
||||||
if idx < tokens[0].idx:
|
if idx < tokens[0].idx:
|
||||||
pass
|
pass
|
||||||
|
@ -73,16 +72,12 @@ cdef class GoldParse:
|
||||||
break
|
break
|
||||||
elif idx in idx_map:
|
elif idx in idx_map:
|
||||||
i = idx_map[idx]
|
i = idx_map[idx]
|
||||||
print i, idx, head, idx_map.get(head, -1)
|
self.tags[i] = tag
|
||||||
self.ids[i] = idx
|
|
||||||
self.tags[i] = pos_tags.index(tag)
|
|
||||||
self.heads[i] = idx_map.get(head, -1)
|
self.heads[i] = idx_map.get(head, -1)
|
||||||
self.labels[i] = dep_labels[label]
|
self.labels[i] = label
|
||||||
self.c_heads[i] = -1
|
self.tags[i] = tag
|
||||||
self.c_labels[i] = -1
|
self.labels[i] = label
|
||||||
self.tags_[i] = tag
|
self.ner[i] = ner
|
||||||
self.labels_[i] = label
|
|
||||||
self.ner_[i] = ner
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def n_non_punct(self):
|
def n_non_punct(self):
|
||||||
|
@ -116,71 +111,3 @@ def _map_indices_to_tokens(ids, heads):
|
||||||
return mapped
|
return mapped
|
||||||
|
|
||||||
|
|
||||||
def _parse_line(line):
|
|
||||||
pieces = line.split()
|
|
||||||
if len(pieces) == 4:
|
|
||||||
return 0, pieces[0], pieces[1], int(pieces[2]) - 1, pieces[3]
|
|
||||||
else:
|
|
||||||
id_ = int(pieces[0])
|
|
||||||
word = pieces[1]
|
|
||||||
pos = pieces[3]
|
|
||||||
iob_ent = pieces[5]
|
|
||||||
head_idx = int(pieces[6])
|
|
||||||
label = pieces[7]
|
|
||||||
return id_, word, pos, head_idx, label, iob_ent
|
|
||||||
|
|
||||||
|
|
||||||
cdef class NERAnnotation:
|
|
||||||
def __init__(self, entities, length, entity_types):
|
|
||||||
self.mem = Pool()
|
|
||||||
self.starts = <int*>self.mem.alloc(length, sizeof(int))
|
|
||||||
self.ends = <int*>self.mem.alloc(length, sizeof(int))
|
|
||||||
self.labels = <int*>self.mem.alloc(length, sizeof(int))
|
|
||||||
self.entities = entities
|
|
||||||
memset(self.starts, -1, sizeof(int) * length)
|
|
||||||
memset(self.ends, -1, sizeof(int) * length)
|
|
||||||
memset(self.labels, -1, sizeof(int) * length)
|
|
||||||
|
|
||||||
cdef int start, end, label
|
|
||||||
for start, end, label in entities:
|
|
||||||
for i in range(start, end):
|
|
||||||
self.starts[i] = start
|
|
||||||
self.ends[i] = end
|
|
||||||
self.labels[i] = label
|
|
||||||
@property
|
|
||||||
def biluo_tags(self):
|
|
||||||
pass
|
|
||||||
|
|
||||||
@property
|
|
||||||
def iob_tags(self):
|
|
||||||
pass
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_iobs(cls, iob_strs, entity_types):
|
|
||||||
return cls.from_biluos(iob_to_biluo(iob_strs), entity_types)
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_biluos(cls, tag_strs, entity_types):
|
|
||||||
entities = []
|
|
||||||
start = None
|
|
||||||
for i, tag_str in enumerate(tag_strs):
|
|
||||||
if tag_str == 'O' or tag_str == '-':
|
|
||||||
continue
|
|
||||||
move, label_str = tag_str.split('-')
|
|
||||||
label = entity_types.index(label_str)
|
|
||||||
if label == -1:
|
|
||||||
label = len(entity_types)
|
|
||||||
entity_types.append(label)
|
|
||||||
if move == 'U':
|
|
||||||
assert start is None
|
|
||||||
entities.append((i, i+1, label))
|
|
||||||
elif move == 'B':
|
|
||||||
assert start is None
|
|
||||||
start = i
|
|
||||||
elif move == 'L':
|
|
||||||
assert start is not None
|
|
||||||
entities.append((start, i+1, label))
|
|
||||||
start = None
|
|
||||||
return cls(entities, len(tag_strs), entity_types)
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -21,6 +21,14 @@ cdef enum:
|
||||||
OUT
|
OUT
|
||||||
N_MOVES
|
N_MOVES
|
||||||
|
|
||||||
|
# String code for each entity move, stored at the index given by the move
# constant. lookup_transition() converts the move part of a tag string back to
# its move with MOVE_NAMES.index().
MOVE_NAMES = [None] * N_MOVES
MOVE_NAMES[MISSING] = 'M'
MOVE_NAMES[BEGIN] = 'B'
MOVE_NAMES[IN] = 'I'
MOVE_NAMES[LAST] = 'L'
MOVE_NAMES[UNIT] = 'U'
MOVE_NAMES[OUT] = 'O'
|
||||||
|
|
||||||
|
|
||||||
cdef do_func_t[N_MOVES] do_funcs
|
cdef do_func_t[N_MOVES] do_funcs
|
||||||
|
|
||||||
|
@ -70,6 +78,23 @@ cdef class BiluoPushDown(TransitionSystem):
|
||||||
move_labels[moves.index(move_str)][label] = True
|
move_labels[moves.index(move_str)][label] = True
|
||||||
return move_labels
|
return move_labels
|
||||||
|
|
||||||
|
cdef int preprocess_gold(self, GoldParse gold) except -1:
    # Run the gold entity tags through iob_to_biluo(), then store the
    # Transition matching each resulting tag string in gold.c_ner, one entry
    # per token position.
    biluo_tags = iob_to_biluo(gold.ner)
    for token_i in range(gold.length):
        tag_name = biluo_tags[token_i]
        gold.c_ner[token_i] = self.lookup_transition(tag_name)
|
||||||
|
|
||||||
|
cdef Transition lookup_transition(self, object name) except *:
    # Resolve a tag string to the matching entry of self.c.
    #
    # `name` is either a bare move code from MOVE_NAMES (e.g. 'O') or
    # '<move>-<label>'. Only the first '-' separates the move from the label,
    # so a label may itself contain hyphens. A bare move is matched against
    # label 0.
    #
    # Raises ValueError (from list.index) if the move code is not in
    # MOVE_NAMES, propagates whatever self.label_ids[...] raises for an
    # unknown label, and raises KeyError(name) if no transition in self.c has
    # the requested move/label pair.
    if '-' in name:
        move_str, label_str = name.split('-', 1)
        label = self.label_ids[label_str]
    else:
        move_str = name
        label = 0
    move = MOVE_NAMES.index(move_str)
    for i in range(self.n_moves):
        if self.c[i].move == move and self.c[i].label == label:
            return self.c[i]
    # Falling off the end would hand the caller an unset Transition struct;
    # fail loudly instead.
    raise KeyError(name)
|
||||||
|
|
||||||
cdef Transition init_transition(self, int clas, int move, int label) except *:
|
cdef Transition init_transition(self, int clas, int move, int label) except *:
|
||||||
# TODO: Apparent Cython bug here when we try to use the Transition()
|
# TODO: Apparent Cython bug here when we try to use the Transition()
|
||||||
# constructor with the function pointers
|
# constructor with the function pointers
|
||||||
|
@ -101,9 +126,9 @@ cdef class BiluoPushDown(TransitionSystem):
|
||||||
cdef int _get_cost(const Transition* self, const State* s, GoldParse gold) except -1:
    # Cost of applying this transition in state `s`: 9000 when the move is
    # not valid here, otherwise 0 if _is_gold() accepts it against the gold
    # entity transitions and 1 if it does not.
    if not _is_valid(self.move, self.label, s):
        return 9000
    cdef bint is_sunk = _entity_is_sunk(s, gold.c_ner)
    # gold.c_ner is allocated with one Transition per token, so the look-ahead
    # at s.i + 1 is only in bounds while another token follows s.i. The old
    # guard (s.i < s.sent_len) still read one element past the array on the
    # last token. With no following token the next gold action is OUT.
    # NOTE(review): assumes s.sent_len equals the token count used to
    # allocate gold.c_ner -- confirm against init_state().
    cdef int next_act = gold.c_ner[s.i+1].move if s.i + 1 < s.sent_len else OUT
    return not _is_gold(self.move, self.label, gold.c_ner[s.i].move, gold.c_ner[s.i].label,
                        next_act, is_sunk)
|
||||||
|
|
||||||
cdef bint _is_gold(int act, int tag, int g_act, int g_tag,
|
cdef bint _is_gold(int act, int tag, int g_act, int g_tag,
|
||||||
|
|
|
@ -34,6 +34,8 @@ from .conll cimport GoldParse
|
||||||
from . import _parse_features
|
from . import _parse_features
|
||||||
from ._parse_features cimport fill_context, CONTEXT_SIZE
|
from ._parse_features cimport fill_context, CONTEXT_SIZE
|
||||||
|
|
||||||
|
from ._ner_features cimport _ner_features
|
||||||
|
|
||||||
|
|
||||||
DEBUG = False
|
DEBUG = False
|
||||||
def set_debug(val):
|
def set_debug(val):
|
||||||
|
@ -55,6 +57,8 @@ def get_templates(name):
|
||||||
pf = _parse_features
|
pf = _parse_features
|
||||||
if name == 'zhang':
|
if name == 'zhang':
|
||||||
return pf.arc_eager
|
return pf.arc_eager
|
||||||
|
elif name == 'ner':
|
||||||
|
return _ner_features.basic
|
||||||
else:
|
else:
|
||||||
return (pf.unigrams + pf.s0_n0 + pf.s1_n0 + pf.s0_n1 + pf.n0_n1 + \
|
return (pf.unigrams + pf.s0_n0 + pf.s1_n0 + pf.s0_n1 + pf.n0_n1 + \
|
||||||
pf.tree_shape + pf.trigrams)
|
pf.tree_shape + pf.trigrams)
|
||||||
|
@ -95,7 +99,8 @@ cdef class GreedyParser:
|
||||||
Transition best
|
Transition best
|
||||||
|
|
||||||
atom_t[CONTEXT_SIZE] context
|
atom_t[CONTEXT_SIZE] context
|
||||||
|
|
||||||
|
self.moves.preprocess_gold(gold)
|
||||||
cdef Pool mem = Pool()
|
cdef Pool mem = Pool()
|
||||||
cdef State* state = init_state(mem, tokens.data, tokens.length)
|
cdef State* state = init_state(mem, tokens.data, tokens.length)
|
||||||
while not is_final(state):
|
while not is_final(state):
|
||||||
|
|
|
@ -29,6 +29,10 @@ cdef class TransitionSystem:
|
||||||
cdef const Transition* c
|
cdef const Transition* c
|
||||||
cdef readonly int n_moves
|
cdef readonly int n_moves
|
||||||
|
|
||||||
|
cdef int preprocess_gold(self, GoldParse gold) except -1
|
||||||
|
|
||||||
|
cdef Transition lookup_transition(self, object name) except *
|
||||||
|
|
||||||
cdef Transition init_transition(self, int clas, int move, int label) except *
|
cdef Transition init_transition(self, int clas, int move, int label) except *
|
||||||
|
|
||||||
cdef Transition best_valid(self, const weight_t* scores, const State* state) except *
|
cdef Transition best_valid(self, const weight_t* scores, const State* state) except *
|
||||||
|
|
|
@ -28,6 +28,12 @@ cdef class TransitionSystem:
|
||||||
self.label_ids['MISSING'] = -1
|
self.label_ids['MISSING'] = -1
|
||||||
self.c = moves
|
self.c = moves
|
||||||
|
|
||||||
|
cdef int preprocess_gold(self, GoldParse gold) except -1:
    # Abstract hook: the base class has no implementation and always raises.
    # Concrete transition systems override this to prepare `gold` for use.
    raise NotImplementedError
|
||||||
|
|
||||||
|
cdef Transition lookup_transition(self, object name) except *:
    # Abstract hook: the base class has no implementation and always raises.
    # Concrete transition systems override this to map a transition name to
    # its Transition.
    raise NotImplementedError
|
||||||
|
|
||||||
cdef Transition init_transition(self, int clas, int move, int label) except *:
|
cdef Transition init_transition(self, int clas, int move, int label) except *:
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user