Mirror of https://github.com/explosion/spaCy.git
commit d06f235fc9
Fix conflict on convert.py
@@ -57,9 +57,9 @@ def train(_, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
     # starts high and decays sharply, to force the optimizer to explore.
     # Batch size starts at 1 and grows, so that we make updates quickly
     # at the beginning of training.
-    dropout_rates = util.decaying(util.env_opt('dropout_from', 0.5),
+    dropout_rates = util.decaying(util.env_opt('dropout_from', 0.2),
                                   util.env_opt('dropout_to', 0.2),
-                                  util.env_opt('dropout_decay', 1e-4))
+                                  util.env_opt('dropout_decay', 0.0))
     batch_sizes = util.compounding(util.env_opt('batch_from', 1),
                                    util.env_opt('batch_to', 64),
                                    util.env_opt('batch_compound', 1.001))
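The two schedule helpers come from spacy.util. As a rough sketch of their semantics (the real implementations may differ in detail), decaying yields a linearly decaying series and compounding a geometrically growing one:

    def decaying(start, stop, decay):
        # Linear decay: begin at `start`, fall by `decay` per step,
        # never dropping below `stop`. decay=0.0 gives a constant rate.
        curr = float(start)
        while True:
            yield max(curr, stop)
            curr -= decay

    def compounding(start, stop, compound):
        # Geometric growth: begin at `start`, multiply by `compound`
        # each step, capped at `stop` (1, 1.001, 1.002001, ... -> 64).
        curr = float(start)
        while True:
            yield min(curr, stop)
            curr *= compound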
@@ -71,10 +71,11 @@ def train(_, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
     optimizer = nlp.begin_training(lambda: corpus.train_tuples, use_gpu=use_gpu)

     print("Itn.\tDep. Loss\tUAS\tNER P.\tNER R.\tNER F.\tTag %\tToken %")
+    try:
         for i in range(n_iter):
             with tqdm.tqdm(total=corpus.count_train(), leave=False) as pbar:
                 train_docs = corpus.train_docs(nlp, projectivize=True,
-                                               gold_preproc=False, shuffle=i)
+                                               gold_preproc=False, max_length=1000)
                 losses = {}
                 for batch in minibatch(train_docs, size=batch_sizes):
                     docs, golds = zip(*batch)
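`minibatch` is what lets the batch size grow: `size` may be a plain integer or a schedule like `compounding`, with one value drawn per batch. A minimal sketch along the lines of `spacy.util.minibatch` (assumed behavior, not the exact source):

    import itertools

    def minibatch(items, size):
        # `size` is either an int or a schedule such as compounding();
        # one value is drawn per batch, so batches can grow over time.
        size_ = itertools.repeat(size) if isinstance(size, int) else size
        items = iter(items)
        while True:
            batch = list(itertools.islice(items, int(next(size_))))
            if not batch:
                return
            yield batch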
@@ -84,8 +85,14 @@ def train(_, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,

             with nlp.use_params(optimizer.averages):
                 scorer = nlp.evaluate(corpus.dev_docs(nlp, gold_preproc=False))
+                with (output_path / ('model%d.pickle' % i)).open('wb') as file_:
+                    dill.dump(nlp, file_, -1)
+
+

             print_progress(i, losses, scorer.scores)
-    with (output_path / 'model.bin').open('wb') as file_:
+    finally:
+        print("Saving model...")
+        with (output_path / 'model-final.pickle').open('wb') as file_:
             with nlp.use_params(optimizer.averages):
                 dill.dump(nlp, file_, -1)
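The loop now checkpoints after each epoch, and the `finally:` block guarantees a final model even if training is interrupted. The save pattern in isolation (a sketch assuming `dill`, a pathlib `output_path`, and an optimizer exposing `averages`):

    import dill

    def save_model(nlp, optimizer, path):
        # Serialize with the averaged weights swapped in, restoring the
        # live weights afterwards; -1 picks the highest pickle protocol.
        with path.open('wb') as file_:
            with nlp.use_params(optimizer.averages):
                dill.dump(nlp, file_, -1)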
@@ -198,15 +198,15 @@ class GoldCorpus(object):
             n += 1
         return n

-    def train_docs(self, nlp, shuffle=0, gold_preproc=False,
-                   projectivize=False):
+    def train_docs(self, nlp, gold_preproc=False,
+                   projectivize=False, max_length=None):
         train_tuples = self.train_tuples
         if projectivize:
             train_tuples = nonproj.preprocess_training_data(
                 self.train_tuples)
-        if shuffle:
         random.shuffle(train_tuples)
-        gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc)
+        gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc,
+                                        max_length=max_length)
         yield from gold_docs

     def dev_docs(self, nlp, gold_preproc=False):
@@ -215,7 +215,7 @@ class GoldCorpus(object):
         yield from gold_docs

     @classmethod
-    def iter_gold_docs(cls, nlp, tuples, gold_preproc):
+    def iter_gold_docs(cls, nlp, tuples, gold_preproc, max_length=None):
         for raw_text, paragraph_tuples in tuples:
             if gold_preproc:
                 raw_text = None
@@ -226,6 +226,7 @@ class GoldCorpus(object):
                                         gold_preproc)
             golds = cls._make_golds(docs, paragraph_tuples)
             for doc, gold in zip(docs, golds):
+                if not max_length or len(doc) < max_length:
                     yield doc, gold

     @classmethod
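Together with the two `train_docs`/`iter_gold_docs` hunks above, this threads an optional `max_length` down to a single gate on document length. The gate is just a generator filter; a standalone sketch (hypothetical helper name):

    def filter_by_length(doc_gold_pairs, max_length=None):
        # max_length=None (or 0) disables the filter, matching the
        # `if not max_length or len(doc) < max_length` gate above.
        for doc, gold in doc_gold_pairs:
            if not max_length or len(doc) < max_length:
                yield doc, gold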
@@ -223,8 +223,7 @@ class Language(object):
             tokvecses, bp_tokvecses = tok2vec.model.begin_update(feats, drop=drop)
             d_tokvecses = proc.update((docs, tokvecses), golds,
                                       drop=drop, sgd=get_grads, losses=losses)
-            bp_tokvecses(d_tokvecses, sgd=get_grads)
-            break
+            bp_tokvecses(d_tokvecses, sgd=sgd)
         for key, (W, dW) in grads.items():
             sgd(W, dW, key=key)
         # Clear the tensor variable, to free GPU memory.
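For context, `Language.update` collects gradients through a `get_grads` callback and applies them in one pass at the end; the fix routes the tok2vec backprop through the real `sgd` and drops a stray `break`. A hypothetical minimal version of that accumulate-then-apply pattern:

    def make_grad_accumulator(sgd):
        # get_grads stashes (weights, gradient) pairs under a key rather
        # than updating immediately; flush applies everything in one pass
        # through the real optimizer, as in the loop shown above.
        grads = {}

        def get_grads(W, dW, key=None):
            grads[key] = (W, dW)

        def flush():
            for key, (W, dW) in grads.items():
                sgd(W, dW, key=key)

        return get_grads, flush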
@@ -345,6 +345,7 @@ cdef cppclass StateC:
         this._s_i = src._s_i
         this._e_i = src._e_i
         this._break = src._break
+        this.offset = src.offset

     void fast_forward() nogil:
         # space token attachement policy:
@@ -350,8 +350,15 @@ cdef class ArcEager(TransitionSystem):
         def __get__(self):
             return (SHIFT, REDUCE, LEFT, RIGHT, BREAK)

+    def has_gold(self, GoldParse gold, start=0, end=None):
+        end = end or len(gold.heads)
+        if all([tag is None for tag in gold.heads[start:end]]):
+            return False
+        else:
+            return True
+
     def preprocess_gold(self, GoldParse gold):
-        if all([h is None for h in gold.heads]):
+        if not self.has_gold(gold):
             return None
         for i in range(gold.length):
             if gold.heads[i] is None: # Missing values
@@ -95,8 +95,15 @@ cdef class BiluoPushDown(TransitionSystem):
         else:
             return MOVE_NAMES[move] + '-' + self.strings[label]

+    def has_gold(self, GoldParse gold, start=0, end=None):
+        end = end or len(gold.ner)
+        if all([tag == '-' for tag in gold.ner[start:end]]):
+            return False
+        else:
+            return True
+
     def preprocess_gold(self, GoldParse gold):
-        if all([tag == '-' for tag in gold.ner]):
+        if not self.has_gold(gold):
             return None
         for i in range(gold.length):
             gold.c.ner[i] = self.lookup_transition(gold.ner[i])
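The two new `has_gold` methods (here and in `ArcEager` above) are the same check with a different missing-value marker: `None` for dependency heads, `'-'` for NER tags. A plain-Python equivalent (hypothetical standalone form):

    def has_gold(annotations, start=0, end=None, missing=None):
        # True if at least one annotation in [start, end) is supervised.
        end = end if end is not None else len(annotations)
        return any(tag != missing for tag in annotations[start:end])

    has_gold([None, 5, None], missing=None)   # True: one head is present
    has_gold(['-', '-', '-'], missing='-')    # False: no NER supervision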
@@ -427,8 +427,7 @@ cdef class Parser:

         cuda_stream = get_cuda_stream()

-        states, golds = self._init_gold_batch(docs, golds)
-        max_length = min([len(doc) for doc in docs])
+        states, golds, max_length = self._init_gold_batch(docs, golds)
         state2vec, vec2scores = self.get_batch_model(len(states), tokvecs, cuda_stream,
                                                      0.0)
         todo = [(s, g) for (s, g) in zip(states, golds)
@@ -472,46 +471,36 @@ cdef class Parser:
                                     backprops, sgd, cuda_stream)
         return self.model[0].ops.unflatten(d_tokvecs, [len(d) for d in docs])

-    def _init_gold_batch(self, docs, golds):
+    def _init_gold_batch(self, whole_docs, whole_golds):
         """Make a square batch, of length equal to the shortest doc. A long
         doc will get multiple states. Let's say we have a doc of length 2*N,
         where N is the shortest doc. We'll make two states, one representing
         long_doc[:N], and another representing long_doc[N:]."""
-        cdef StateClass state
-        lengths = [len(doc) for doc in docs]
-        min_length = min(lengths)
-        offset = 0
+        cdef:
+            StateClass state
+            Transition action
+        whole_states = self.moves.init_batch(whole_docs)
+        max_length = max(5, min(20, min([len(doc) for doc in whole_docs])))
         states = []
-        extra_golds = []
-        cdef Pool mem = Pool()
-        costs = <float*>mem.alloc(self.moves.n_moves, sizeof(float))
-        is_valid = <int*>mem.alloc(self.moves.n_moves, sizeof(int))
-        for doc, gold in zip(docs, golds):
+        golds = []
+        for doc, state, gold in zip(whole_docs, whole_states, whole_golds):
             gold = self.moves.preprocess_gold(gold)
-            state = StateClass(doc, offset=offset)
-            self.moves.initialize_state(state.c)
-            if not state.is_final():
-                states.append(state)
-                extra_golds.append(gold)
-            start = min(min_length, len(doc))
+            if gold is None:
+                continue
+            oracle_actions = self.moves.get_oracle_sequence(doc, gold)
+            start = 0
             while start < len(doc):
-                length = min(min_length, len(doc)-start)
-                state = StateClass(doc, offset=offset)
-                self.moves.initialize_state(state.c)
+                state = state.copy()
                 while state.B(0) < start and not state.is_final():
-                    self.moves.set_costs(is_valid, costs, state, gold)
-                    for i in range(self.moves.n_moves):
-                        if is_valid[i] and costs[i] <= 0:
-                            self.moves.c[i].do(state.c, self.moves.c[i].label)
-                            break
-                    else:
-                        raise ValueError("Could not find gold move")
-                start += length
-                if not state.is_final():
-                    states.append(state)
-                    extra_golds.append(gold)
-            offset += len(doc)
-        return states, extra_golds
+                    action = self.moves.c[oracle_actions.pop(0)]
+                    action.do(state.c, action.label)
+                has_gold = self.moves.has_gold(gold, start=start,
+                                               end=start+max_length)
+                if not state.is_final() and has_gold:
+                    states.append(state)
+                    golds.append(gold)
+                start += min(max_length, len(doc)-start)
+        return states, golds, max_length

     def _make_updates(self, d_tokvecs, backprops, sgd, cuda_stream=None):
         # Tells CUDA to block, so our async copies complete.
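The windowing arithmetic in the rewritten `_init_gold_batch` is easier to see without the Cython state machinery. A schematic pure-Python version (hypothetical; the real code fast-forwards a `StateClass` copy through oracle actions rather than slicing):

    def square_batch_windows(doc_lengths, floor=5, ceil=20):
        # One window size for the whole batch: clamp the shortest doc
        # into [floor, ceil], then cut every doc into windows that long.
        max_length = max(floor, min(ceil, min(doc_lengths)))
        windows = []
        for doc_id, length in enumerate(doc_lengths):
            start = 0
            while start < length:
                windows.append((doc_id, start, min(start + max_length, length)))
                start += min(max_length, length - start)
        return windows, max_length

    # doc lengths [8, 23] -> max_length 8; doc 1 yields windows
    # [0:8], [8:16], [16:23], so long docs contribute several states.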
@@ -41,6 +41,11 @@ cdef class StateClass:
     def is_final(self):
         return self.c.is_final()

+    def copy(self):
+        cdef StateClass new_state = StateClass.init(self.c._sent, self.c.length)
+        new_state.c.clone(self.c)
+        return new_state
+
     def print_state(self, words):
         words = list(words) + ['_']
         top = words[self.S(0)] + '_%d' % self.S_(0).head
@@ -61,6 +61,24 @@ cdef class TransitionSystem:
             offset += len(doc)
         return states

+    def get_oracle_sequence(self, doc, GoldParse gold):
+        cdef Pool mem = Pool()
+        costs = <float*>mem.alloc(self.n_moves, sizeof(float))
+        is_valid = <int*>mem.alloc(self.n_moves, sizeof(int))
+
+        cdef StateClass state = StateClass(doc, offset=0)
+        self.initialize_state(state.c)
+        history = []
+        while not state.is_final():
+            self.set_costs(is_valid, costs, state, gold)
+            for i in range(self.n_moves):
+                if is_valid[i] and costs[i] <= 0:
+                    action = self.c[i]
+                    history.append(i)
+                    action.do(state.c, action.label)
+                    break
+        return history
+
     cdef int initialize_state(self, StateC* state) nogil:
         pass
@@ -92,11 +110,21 @@ cdef class TransitionSystem:
                       StateClass stcls, GoldParse gold) except -1:
         cdef int i
         self.set_valid(is_valid, stcls.c)
+        cdef int n_gold = 0
         for i in range(self.n_moves):
             if is_valid[i]:
                 costs[i] = self.c[i].get_cost(stcls, &gold.c, self.c[i].label)
+                n_gold += costs[i] <= 0
             else:
                 costs[i] = 9000
+        if n_gold <= 0:
+            print(gold.words)
+            print(gold.ner)
+            raise ValueError(
+                "Could not find a gold-standard action to supervise "
+                "the entity recognizer\n"
+                "The transition system has %d actions.\n"
+                "%s" % (self.n_moves))

     def add_action(self, int action, label):
         if not isinstance(label, int):