Fix conflict on convert.py

Matthew Honnibal 2017-05-26 11:33:29 -05:00
commit d06f235fc9
9 changed files with 106 additions and 62 deletions

View File

@@ -57,9 +57,9 @@ def train(_, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
     # starts high and decays sharply, to force the optimizer to explore.
     # Batch size starts at 1 and grows, so that we make updates quickly
     # at the beginning of training.
-    dropout_rates = util.decaying(util.env_opt('dropout_from', 0.5),
+    dropout_rates = util.decaying(util.env_opt('dropout_from', 0.2),
                                   util.env_opt('dropout_to', 0.2),
-                                  util.env_opt('dropout_decay', 1e-4))
+                                  util.env_opt('dropout_decay', 0.0))
     batch_sizes = util.compounding(util.env_opt('batch_from', 1),
                                    util.env_opt('batch_to', 64),
                                    util.env_opt('batch_compound', 1.001))
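Note on the helpers: util.env_opt reads a hyper-parameter from an environment variable, falling back to the given default, while util.decaying and util.compounding return endless generators that the training loop advances once per update (next(dropout_rates)). With the new defaults (from 0.2, to 0.2, decay 0.0) the dropout schedule is effectively constant at 0.2. A minimal sketch of such schedule generators, assuming linear decay and geometric growth rather than the exact spacy.util implementation at this commit:

    def decaying(start, stop, decay):
        # Yield values decaying linearly from `start` to the floor `stop`.
        curr = float(start)
        while True:
            yield max(curr, stop)
            curr -= decay

    def compounding(start, stop, compound):
        # Yield values growing geometrically from `start`, capped at `stop`.
        curr = float(start)
        while True:
            yield min(curr, stop)
            curr *= compound

    batch_sizes = compounding(1, 64, 1.001)
    print([round(next(batch_sizes), 3) for _ in range(3)])  # [1.0, 1.001, 1.002]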
@@ -71,23 +71,30 @@ def train(_, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
     optimizer = nlp.begin_training(lambda: corpus.train_tuples, use_gpu=use_gpu)
     print("Itn.\tDep. Loss\tUAS\tNER P.\tNER R.\tNER F.\tTag %\tToken %")
-    for i in range(n_iter):
-        with tqdm.tqdm(total=corpus.count_train(), leave=False) as pbar:
-            train_docs = corpus.train_docs(nlp, projectivize=True,
-                                           gold_preproc=False, shuffle=i)
-            losses = {}
-            for batch in minibatch(train_docs, size=batch_sizes):
-                docs, golds = zip(*batch)
-                nlp.update(docs, golds, sgd=optimizer,
-                           drop=next(dropout_rates), losses=losses)
-                pbar.update(len(docs))
-        with nlp.use_params(optimizer.averages):
-            scorer = nlp.evaluate(corpus.dev_docs(nlp, gold_preproc=False))
-        print_progress(i, losses, scorer.scores)
-    with (output_path / 'model.bin').open('wb') as file_:
-        with nlp.use_params(optimizer.averages):
-            dill.dump(nlp, file_, -1)
+    try:
+        for i in range(n_iter):
+            with tqdm.tqdm(total=corpus.count_train(), leave=False) as pbar:
+                train_docs = corpus.train_docs(nlp, projectivize=True,
+                                               gold_preproc=False, max_length=1000)
+                losses = {}
+                for batch in minibatch(train_docs, size=batch_sizes):
+                    docs, golds = zip(*batch)
+                    nlp.update(docs, golds, sgd=optimizer,
+                               drop=next(dropout_rates), losses=losses)
+                    pbar.update(len(docs))
+            with nlp.use_params(optimizer.averages):
+                scorer = nlp.evaluate(corpus.dev_docs(nlp, gold_preproc=False))
+                with (output_path / ('model%d.pickle' % i)).open('wb') as file_:
+                    dill.dump(nlp, file_, -1)
+            print_progress(i, losses, scorer.scores)
+    finally:
+        print("Saving model...")
+        with (output_path / 'model-final.pickle').open('wb') as file_:
+            with nlp.use_params(optimizer.averages):
+                dill.dump(nlp, file_, -1)
 
 
 def _render_parses(i, to_render):
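The loop now checkpoints after every iteration (model0.pickle, model1.pickle, ...) with the averaged parameters, and the finally block guarantees that model-final.pickle is written even when training is interrupted. A checkpoint saved with dill.dump can be restored with dill.load, assuming the same Python and library versions that wrote it:

    import dill

    # Restore the final checkpoint written by the training loop above.
    with open('model-final.pickle', 'rb') as file_:
        nlp = dill.load(file_)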

View File

@@ -198,15 +198,15 @@ class GoldCorpus(object):
                 n += 1
         return n
 
-    def train_docs(self, nlp, shuffle=0, gold_preproc=False,
-                   projectivize=False):
+    def train_docs(self, nlp, gold_preproc=False,
+                   projectivize=False, max_length=None):
         train_tuples = self.train_tuples
         if projectivize:
             train_tuples = nonproj.preprocess_training_data(
                 self.train_tuples)
-        if shuffle:
-            random.shuffle(train_tuples)
-        gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc)
+        random.shuffle(train_tuples)
+        gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc,
+                                        max_length=max_length)
         yield from gold_docs
 
     def dev_docs(self, nlp, gold_preproc=False):
@@ -215,7 +215,7 @@ class GoldCorpus(object):
         yield from gold_docs
 
     @classmethod
-    def iter_gold_docs(cls, nlp, tuples, gold_preproc):
+    def iter_gold_docs(cls, nlp, tuples, gold_preproc, max_length=None):
         for raw_text, paragraph_tuples in tuples:
             if gold_preproc:
                 raw_text = None
@@ -226,7 +226,8 @@ class GoldCorpus(object):
                                        gold_preproc)
             golds = cls._make_golds(docs, paragraph_tuples)
             for doc, gold in zip(docs, golds):
-                yield doc, gold
+                if not max_length or len(doc) < max_length:
+                    yield doc, gold
 
     @classmethod
     def _make_docs(cls, nlp, raw_text, paragraph_tuples, gold_preproc):
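Two behavioural changes here: training docs are now always shuffled (the shuffle flag is gone), and a max_length cut-off skips overlong documents; train.py passes max_length=1000 above. The guard treats both None and 0 as "no limit". The filter in isolation:

    def filter_by_length(docs, max_length=None):
        # Same guard as iter_gold_docs: a falsy max_length disables the
        # filter; otherwise only docs strictly shorter pass through.
        for doc in docs:
            if not max_length or len(doc) < max_length:
                yield doc

    print(list(filter_by_length(['ab', 'abcdef'], max_length=3)))  # ['ab']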

View File

@@ -223,8 +223,7 @@ class Language(object):
             tokvecses, bp_tokvecses = tok2vec.model.begin_update(feats, drop=drop)
             d_tokvecses = proc.update((docs, tokvecses), golds,
                                       drop=drop, sgd=get_grads, losses=losses)
-            bp_tokvecses(d_tokvecses, sgd=get_grads)
-            break
+            bp_tokvecses(d_tokvecses, sgd=sgd)
         for key, (W, dW) in grads.items():
             sgd(W, dW, key=key)
         # Clear the tensor variable, to free GPU memory.
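Two fixes: a leftover debugging break that stopped the pipeline after its first component is removed, and backprop through the shared tok2vec layer now applies its updates with the real optimizer (sgd) instead of the gradient accumulator. A sketch of the surrounding accumulate-then-apply pattern, with a hypothetical stand-in optimizer (the real code lives in Language.update):

    import numpy

    grads = {}

    def get_grads(W, dW, key=None):
        # Stash the gradient instead of applying it immediately.
        grads[key] = (W, dW)

    def sgd(W, dW, key=None):
        # Hypothetical stand-in: plain in-place gradient descent.
        W -= 0.001 * dW

    W, dW = numpy.ones(3), numpy.full(3, 0.5)
    get_grads(W, dW, key=0)
    for key, (W, dW) in grads.items():
        sgd(W, dW, key=key)  # apply everything accumulated in one pass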

View File

@@ -345,6 +345,7 @@ cdef cppclass StateC:
         this._s_i = src._s_i
         this._e_i = src._e_i
         this._break = src._break
+        this.offset = src.offset
 
     void fast_forward() nogil:
         # space token attachement policy:

View File

@@ -350,8 +350,15 @@ cdef class ArcEager(TransitionSystem):
         def __get__(self):
             return (SHIFT, REDUCE, LEFT, RIGHT, BREAK)
 
+    def has_gold(self, GoldParse gold, start=0, end=None):
+        end = end or len(gold.heads)
+        if all([tag is None for tag in gold.heads[start:end]]):
+            return False
+        else:
+            return True
+
     def preprocess_gold(self, GoldParse gold):
-        if all([h is None for h in gold.heads]):
+        if not self.has_gold(gold):
             return None
         for i in range(gold.length):
             if gold.heads[i] is None: # Missing values
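has_gold generalises the old all-missing check to a half-open window [start, end), which _init_gold_batch (below) uses to skip stretches of a document that carry no dependency annotation. The same logic in plain Python:

    heads = [None, None, 2, 0, None]  # token heads; None = missing

    def has_gold(heads, start=0, end=None):
        end = end or len(heads)
        return not all(h is None for h in heads[start:end])

    print(has_gold(heads))        # True: annotation exists somewhere
    print(has_gold(heads, 0, 2))  # False: this window is unannotated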

View File

@@ -95,8 +95,15 @@ cdef class BiluoPushDown(TransitionSystem):
         else:
             return MOVE_NAMES[move] + '-' + self.strings[label]
 
+    def has_gold(self, GoldParse gold, start=0, end=None):
+        end = end or len(gold.ner)
+        if all([tag == '-' for tag in gold.ner[start:end]]):
+            return False
+        else:
+            return True
+
     def preprocess_gold(self, GoldParse gold):
-        if all([tag == '-' for tag in gold.ner]):
+        if not self.has_gold(gold):
             return None
         for i in range(gold.length):
             gold.c.ner[i] = self.lookup_transition(gold.ner[i])

View File

@@ -427,8 +427,7 @@ cdef class Parser:
         cuda_stream = get_cuda_stream()
-        states, golds = self._init_gold_batch(docs, golds)
-        max_length = min([len(doc) for doc in docs])
+        states, golds, max_length = self._init_gold_batch(docs, golds)
         state2vec, vec2scores = self.get_batch_model(len(states), tokvecs, cuda_stream,
                                                      0.0)
         todo = [(s, g) for (s, g) in zip(states, golds)
@@ -472,46 +471,36 @@ cdef class Parser:
                                 backprops, sgd, cuda_stream)
         return self.model[0].ops.unflatten(d_tokvecs, [len(d) for d in docs])
 
-    def _init_gold_batch(self, docs, golds):
+    def _init_gold_batch(self, whole_docs, whole_golds):
         """Make a square batch, of length equal to the shortest doc. A long
         doc will get multiple states. Let's say we have a doc of length 2*N,
         where N is the shortest doc. We'll make two states, one representing
         long_doc[:N], and another representing long_doc[N:]."""
-        cdef StateClass state
-        lengths = [len(doc) for doc in docs]
-        min_length = min(lengths)
-        offset = 0
+        cdef:
+            StateClass state
+            Transition action
+        whole_states = self.moves.init_batch(whole_docs)
+        max_length = max(5, min(20, min([len(doc) for doc in whole_docs])))
         states = []
-        extra_golds = []
-        cdef Pool mem = Pool()
-        costs = <float*>mem.alloc(self.moves.n_moves, sizeof(float))
-        is_valid = <int*>mem.alloc(self.moves.n_moves, sizeof(int))
-        for doc, gold in zip(docs, golds):
+        golds = []
+        for doc, state, gold in zip(whole_docs, whole_states, whole_golds):
             gold = self.moves.preprocess_gold(gold)
-            state = StateClass(doc, offset=offset)
-            self.moves.initialize_state(state.c)
-            if not state.is_final():
-                states.append(state)
-                extra_golds.append(gold)
-            start = min(min_length, len(doc))
+            if gold is None:
+                continue
+            oracle_actions = self.moves.get_oracle_sequence(doc, gold)
+            start = 0
             while start < len(doc):
-                length = min(min_length, len(doc)-start)
-                state = StateClass(doc, offset=offset)
-                self.moves.initialize_state(state.c)
+                state = state.copy()
                 while state.B(0) < start and not state.is_final():
-                    self.moves.set_costs(is_valid, costs, state, gold)
-                    for i in range(self.moves.n_moves):
-                        if is_valid[i] and costs[i] <= 0:
-                            self.moves.c[i].do(state.c, self.moves.c[i].label)
-                            break
-                    else:
-                        raise ValueError("Could not find gold move")
-                start += length
-                if not state.is_final():
+                    action = self.moves.c[oracle_actions.pop(0)]
+                    action.do(state.c, action.label)
+                has_gold = self.moves.has_gold(gold, start=start,
+                                               end=start+max_length)
+                if not state.is_final() and has_gold:
                     states.append(state)
-                    extra_golds.append(gold)
-            offset += len(doc)
-        return states, extra_golds
+                    golds.append(gold)
+                start += min(max_length, len(doc)-start)
+        return states, golds, max_length
 
     def _make_updates(self, d_tokvecs, backprops, sgd, cuda_stream=None):
         # Tells CUDA to block, so our async copies complete.
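The rewrite replaces the shortest-doc heuristic with a window size clamped to max(5, min(20, shortest_doc)), and fast-forwards each window's start state by replaying the precomputed oracle sequence instead of re-scoring every candidate move. The stepping logic in isolation: a doc of length 50 with max_length 20 yields states for windows starting at 0, 20 and 40.

    def window_starts(doc_length, max_length):
        # Same stepping as the while-loop in _init_gold_batch.
        start, starts = 0, []
        while start < doc_length:
            starts.append(start)
            start += min(max_length, doc_length - start)
        return starts

    print(window_starts(50, 20))  # [0, 20, 40]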

View File

@@ -41,6 +41,11 @@ cdef class StateClass:
     def is_final(self):
         return self.c.is_final()
 
+    def copy(self):
+        cdef StateClass new_state = StateClass.init(self.c._sent, self.c.length)
+        new_state.c.clone(self.c)
+        return new_state
+
     def print_state(self, words):
         words = list(words) + ['_']
         top = words[self.S(0)] + '_%d' % self.S_(0).head

View File

@@ -61,6 +61,24 @@ cdef class TransitionSystem:
             offset += len(doc)
         return states
 
+    def get_oracle_sequence(self, doc, GoldParse gold):
+        cdef Pool mem = Pool()
+        costs = <float*>mem.alloc(self.n_moves, sizeof(float))
+        is_valid = <int*>mem.alloc(self.n_moves, sizeof(int))
+
+        cdef StateClass state = StateClass(doc, offset=0)
+        self.initialize_state(state.c)
+        history = []
+        while not state.is_final():
+            self.set_costs(is_valid, costs, state, gold)
+            for i in range(self.n_moves):
+                if is_valid[i] and costs[i] <= 0:
+                    action = self.c[i]
+                    history.append(i)
+                    action.do(state.c, action.label)
+                    break
+        return history
+
     cdef int initialize_state(self, StateC* state) nogil:
         pass
@@ -92,11 +110,21 @@ cdef class TransitionSystem:
                         StateClass stcls, GoldParse gold) except -1:
         cdef int i
         self.set_valid(is_valid, stcls.c)
+        cdef int n_gold = 0
         for i in range(self.n_moves):
             if is_valid[i]:
                 costs[i] = self.c[i].get_cost(stcls, &gold.c, self.c[i].label)
+                n_gold += costs[i] <= 0
             else:
                 costs[i] = 9000
+        if n_gold <= 0:
+            print(gold.words)
+            print(gold.ner)
+            raise ValueError(
+                "Could not find a gold-standard action to supervise "
+                "the entity recognizer.\n"
+                "The transition system has %d actions." % (self.n_moves,))
 
     def add_action(self, int action, label):
         if not isinstance(label, int):
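The n_gold counter makes set_costs fail loudly when a state has no zero-cost action to learn from, instead of letting training proceed on a state it cannot supervise. The counting idiom relies on booleans coercing to integers:

    n_gold = 0
    for cost, valid in [(0.0, True), (2.0, True), (0.0, False)]:
        if valid:
            n_gold += cost <= 0  # True counts as 1, False as 0
    print(n_gold)  # 1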