mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-14 05:37:03 +03:00
* Merge train.py
This commit is contained in:
parent
2e3dc3dfe2
commit
3a302ae6f2
|
@ -61,8 +61,8 @@ def read_docparse_gold(file_):
|
||||||
tags = []
|
tags = []
|
||||||
ids = []
|
ids = []
|
||||||
lines = sent_str.strip().split('\n')
|
lines = sent_str.strip().split('\n')
|
||||||
raw_text = lines.pop(0)
|
raw_text = lines.pop(0).strip()
|
||||||
tok_text = lines.pop(0)
|
tok_text = lines.pop(0).strip()
|
||||||
for i, line in enumerate(lines):
|
for i, line in enumerate(lines):
|
||||||
id_, word, pos_string, head_idx, label = _parse_line(line)
|
id_, word, pos_string, head_idx, label = _parse_line(line)
|
||||||
if label == 'root':
|
if label == 'root':
|
||||||
|
@ -234,6 +234,7 @@ def evaluate(Language, dev_loc, model_dir, gold_preproc=False):
|
||||||
skipped = 0
|
skipped = 0
|
||||||
loss = 0
|
loss = 0
|
||||||
with codecs.open(dev_loc, 'r', 'utf8') as file_:
|
with codecs.open(dev_loc, 'r', 'utf8') as file_:
|
||||||
|
#paragraphs = read_tokenized_gold(file_)
|
||||||
paragraphs = read_docparse_gold(file_)
|
paragraphs = read_docparse_gold(file_)
|
||||||
for tokens, tag_strs, heads, labels in iter_data(paragraphs, nlp.tokenizer,
|
for tokens, tag_strs, heads, labels in iter_data(paragraphs, nlp.tokenizer,
|
||||||
gold_preproc=gold_preproc):
|
gold_preproc=gold_preproc):
|
||||||
|
@ -241,11 +242,7 @@ def evaluate(Language, dev_loc, model_dir, gold_preproc=False):
|
||||||
nlp.tagger(tokens)
|
nlp.tagger(tokens)
|
||||||
nlp.parser(tokens)
|
nlp.parser(tokens)
|
||||||
for i, token in enumerate(tokens):
|
for i, token in enumerate(tokens):
|
||||||
try:
|
|
||||||
pos_corr += token.tag_ == tag_strs[i]
|
pos_corr += token.tag_ == tag_strs[i]
|
||||||
except:
|
|
||||||
print i, token.orth_, token.tag
|
|
||||||
raise
|
|
||||||
n_tokens += 1
|
n_tokens += 1
|
||||||
if heads[i] is None:
|
if heads[i] is None:
|
||||||
skipped += 1
|
skipped += 1
|
||||||
|
|
Loading…
Reference in New Issue
Block a user