Mirror of https://github.com/explosion/spaCy.git

Commit c500d72dc2 (parent 763cbd23d5)

* Temporarily disable NER, and wire up the verbose flag during training
@@ -48,7 +48,7 @@ def add_noise(orig, noise_level):
     return ''.join(_corrupt(c, noise_level) for c in orig)
 
 
-def score_model(scorer, nlp, raw_text, annot_tuples):
+def score_model(scorer, nlp, raw_text, annot_tuples, verbose=False):
     if raw_text is None:
         tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
     else:
@@ -57,7 +57,7 @@ def score_model(scorer, nlp, raw_text, annot_tuples):
     nlp.entity(tokens)
     nlp.parser(tokens)
     gold = GoldParse(tokens, annot_tuples)
-    scorer.score(tokens, gold, verbose=False)
+    scorer.score(tokens, gold, verbose=verbose)
 
 
 def _merge_sents(sents):
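
These first two hunks are a single change: score_model grows a verbose keyword
that defaults to quiet, and the hard-coded verbose=False passed to scorer.score
becomes a pass-through. A minimal sketch of that flag-threading pattern, with
hypothetical names standing in for the spaCy objects:

    def report(result, verbose=False):
        # Innermost layer: prints only when explicitly asked to.
        if verbose:
            print('result: %s' % result)

    def score_once(result, verbose=False):
        # Middle layer: forwards the caller's flag instead of
        # hard-coding verbose=False, so the caller stays in control.
        report(result, verbose=verbose)

    score_once(0.91)                # quiet by default
    score_once(0.91, verbose=True)  # opt in to diagnostics
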
@@ -78,7 +78,7 @@ def _merge_sents(sents):
 
 def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
           seed=0, gold_preproc=False, n_sents=0, corruption_level=0,
-          beam_width=1):
+          beam_width=1, verbose=False):
     dep_model_dir = path.join(model_dir, 'deps')
     pos_model_dir = path.join(model_dir, 'pos')
     ner_model_dir = path.join(model_dir, 'ner')
@@ -118,7 +118,8 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
             for annot_tuples, ctnt in sents:
                 if len(annot_tuples[1]) == 1:
                     continue
-                score_model(scorer, nlp, raw_text, annot_tuples)
+                score_model(scorer, nlp, raw_text, annot_tuples,
+                            verbose=verbose if itn >= 2 else False)
                 if raw_text is None:
                     words = add_noise(annot_tuples[1], corruption_level)
                     tokens = nlp.tokenizer.tokens_from_list(words)
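
The replacement call gates verbosity on the iteration counter: the expression
verbose if itn >= 2 else False keeps the first two training passes quiet even
when the flag is set, since a barely-trained model makes so many errors that
per-sentence diagnostics would be pure noise. The gating in isolation, as a
standalone sketch with assumed values:

    verbose = True

    for itn in range(5):
        # Honour the flag only from iteration 2 onwards.
        effective = verbose if itn >= 2 else False
        print(itn, effective)  # 0 False, 1 False, 2 True, 3 True, 4 True
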
@@ -129,7 +130,7 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
                 gold = GoldParse(tokens, annot_tuples, make_projective=True)
                 loss += nlp.parser.train(tokens, gold)
 
-                nlp.entity.train(tokens, gold)
+                #nlp.entity.train(tokens, gold)
                 nlp.tagger.train(tokens, gold.tags)
         random.shuffle(gold_tuples)
         print '%d:\t%d\t%.3f\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.ents_f,
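
Here NER training is switched off by commenting out nlp.entity.train(tokens,
gold); the parser and tagger keep training, so the scorer.ents_f column in the
progress line will no longer reflect a freshly trained entity model. A less
invasive alternative would be a keyword toggle, sketched below with a
hypothetical train_ner parameter (a design note, not what this commit does):

    def train_step(nlp, tokens, gold, train_ner=False):
        # Gate NER behind a flag instead of commenting it out,
        # so re-enabling it later is a one-argument change.
        loss = nlp.parser.train(tokens, gold)
        if train_ner:
            nlp.entity.train(tokens, gold)
        nlp.tagger.train(tokens, gold.tags)
        return loss
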
@@ -156,7 +157,7 @@ def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False
             if raw_text is None:
                 tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
                 nlp.tagger(tokens)
-                nlp.entity(tokens)
+                #nlp.entity(tokens)
                 nlp.parser(tokens)
             else:
                 tokens = nlp(raw_text, merge_mwes=False)
@@ -178,7 +179,7 @@ def write_parses(Language, dev_loc, model_dir, out_loc, beam_width=None):
         if raw_text is None:
             tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
             nlp.tagger(tokens)
-            nlp.entity(tokens)
+            #nlp.entity(tokens)
             nlp.parser(tokens)
         else:
             tokens = nlp(raw_text, merge_mwes=False)
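
The evaluate and write_parses hunks touch the same two-path pattern: when gold
preprocessing supplies pre-tokenized input, each pipeline component is applied
by hand (tokens_from_list, then tagger, entity, parser), while raw text runs
through the whole pipeline in one nlp(...) call. Commenting out
nlp.entity(tokens) therefore only disables NER on the pre-tokenized path; the
raw-text path still runs the full pipeline. The two-path shape in miniature,
with toy components that are not spaCy APIs:

    def run_components(components, doc):
        # Manual path: apply each enabled component in order,
        # mirroring tagger -> (entity) -> parser above.
        for component in components:
            doc = component(doc)
        return doc

    tagger = lambda doc: doc + ['TAGGED']
    parser = lambda doc: doc + ['PARSED']

    # The entity component is left out of the list, just as
    # nlp.entity(tokens) is commented out in the diff.
    print(run_components([tagger, parser], ['token']))
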
@@ -214,9 +215,9 @@ def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbos
           feat_set='basic' if not debug else 'debug',
           gold_preproc=gold_preproc, n_sents=n_sents,
           corruption_level=corruption_level, n_iter=n_iter,
-          beam_width=beam_width)
-    if out_loc:
-        write_parses(English, dev_loc, model_dir, out_loc, beam_width=beam_width)
+          beam_width=beam_width, verbose=verbose)
+    #if out_loc:
+    #    write_parses(English, dev_loc, model_dir, out_loc, beam_width=beam_width)
     scorer = evaluate(English, list(read_json_file(dev_loc)),
                       model_dir, gold_preproc=gold_preproc, verbose=verbose,
                       beam_width=beam_width)
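
With this last hunk, main forwards verbose into train (new) and continues to
forward it into evaluate, while the write_parses step is commented out together
with its if out_loc: guard, so out_loc is still accepted but now ignored. The
resulting flow of the flag, reduced to a toy sketch with hypothetical bodies:

    def score_model(verbose=False):
        if verbose:
            print('per-sentence diagnostics')

    def train(n_iter=3, verbose=False):
        for itn in range(n_iter):
            score_model(verbose=verbose if itn >= 2 else False)

    def evaluate(verbose=False):
        score_model(verbose=verbose)

    def main(verbose=False):
        train(verbose=verbose)
        # write_parses(...) is disabled in this commit.
        evaluate(verbose=verbose)

    main(verbose=True)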