Move timing into Language.evaluate (#5836)

Move timing into `Language.evaluate` so that only the processing is
timed, not processing + scoring. `Language.evaluate` returns
`scores["speed"]` as words per second, which should be identical to how
the speed was added to the scores previously. Also add the speed to the
evaluate CLI output.
This commit is contained in:
Adriane Boyd 2020-07-29 11:02:31 +02:00 committed by GitHub
parent 256b24b720
commit 0cddb0dbe9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 26 additions and 15 deletions

View File

@ -67,10 +67,7 @@ def evaluate(
corpus = Corpus(data_path, data_path) corpus = Corpus(data_path, data_path)
nlp = util.load_model(model) nlp = util.load_model(model)
dev_dataset = list(corpus.dev_dataset(nlp, gold_preproc=gold_preproc)) dev_dataset = list(corpus.dev_dataset(nlp, gold_preproc=gold_preproc))
begin = timer()
scores = nlp.evaluate(dev_dataset, verbose=False) scores = nlp.evaluate(dev_dataset, verbose=False)
end = timer()
nwords = sum(len(ex.predicted) for ex in dev_dataset)
metrics = { metrics = {
"TOK": "token_acc", "TOK": "token_acc",
"TAG": "tag_acc", "TAG": "tag_acc",
@ -82,16 +79,20 @@ def evaluate(
"NER P": "ents_p", "NER P": "ents_p",
"NER R": "ents_r", "NER R": "ents_r",
"NER F": "ents_f", "NER F": "ents_f",
"Textcat": "cats_score", "TEXTCAT": "cats_score",
"Sent P": "sents_p", "SENT P": "sents_p",
"Sent R": "sents_r", "SENT R": "sents_r",
"Sent F": "sents_f", "SENT F": "sents_f",
"SPEED": "speed",
} }
results = {} results = {}
for metric, key in metrics.items(): for metric, key in metrics.items():
if key in scores: if key in scores:
if key == "cats_score": if key == "cats_score":
metric = metric + " (" + scores.get("cats_score_desc", "unk") + ")" metric = metric + " (" + scores.get("cats_score_desc", "unk") + ")"
if key == "speed":
results[metric] = f"{scores[key]:.0f}"
else:
results[metric] = f"{scores[key]*100:.2f}" results[metric] = f"{scores[key]*100:.2f}"
data = {re.sub(r"[\s/]", "_", k.lower()): v for k, v in results.items()} data = {re.sub(r"[\s/]", "_", k.lower()): v for k, v in results.items()}

View File

@ -1,5 +1,4 @@
from typing import Optional, Dict, Any, Tuple, Union, Callable, List from typing import Optional, Dict, Any, Tuple, Union, Callable, List
from timeit import default_timer as timer
import srsly import srsly
import tqdm import tqdm
from pathlib import Path from pathlib import Path
@ -248,14 +247,11 @@ def create_evaluation_callback(
dev_examples = list(dev_examples) dev_examples = list(dev_examples)
n_words = sum(len(ex.predicted) for ex in dev_examples) n_words = sum(len(ex.predicted) for ex in dev_examples)
batch_size = cfg["eval_batch_size"] batch_size = cfg["eval_batch_size"]
start_time = timer()
if optimizer.averages: if optimizer.averages:
with nlp.use_params(optimizer.averages): with nlp.use_params(optimizer.averages):
scores = nlp.evaluate(dev_examples, batch_size=batch_size) scores = nlp.evaluate(dev_examples, batch_size=batch_size)
else: else:
scores = nlp.evaluate(dev_examples, batch_size=batch_size) scores = nlp.evaluate(dev_examples, batch_size=batch_size)
end_time = timer()
wps = n_words / (end_time - start_time)
# Calculate a weighted sum based on score_weights for the main score # Calculate a weighted sum based on score_weights for the main score
weights = cfg["score_weights"] weights = cfg["score_weights"]
try: try:
@ -264,7 +260,6 @@ def create_evaluation_callback(
keys = list(scores.keys()) keys = list(scores.keys())
err = Errors.E983.format(dict="score_weights", key=str(e), keys=keys) err = Errors.E983.format(dict="score_weights", key=str(e), keys=keys)
raise KeyError(err) raise KeyError(err)
scores["speed"] = wps
return weighted_score, scores return weighted_score, scores
return evaluate return evaluate

View File

@ -14,6 +14,7 @@ from thinc.api import get_current_ops, Config, require_gpu, Optimizer
import srsly import srsly
import multiprocessing as mp import multiprocessing as mp
from itertools import chain, cycle from itertools import chain, cycle
from timeit import default_timer as timer
from .tokens.underscore import Underscore from .tokens.underscore import Underscore
from .vocab import Vocab, create_vocab from .vocab import Vocab, create_vocab
@ -1088,7 +1089,14 @@ class Language:
kwargs.setdefault("verbose", verbose) kwargs.setdefault("verbose", verbose)
kwargs.setdefault("nlp", self) kwargs.setdefault("nlp", self)
scorer = Scorer(**kwargs) scorer = Scorer(**kwargs)
docs = list(eg.predicted for eg in examples) texts = [eg.reference.text for eg in examples]
docs = [eg.predicted for eg in examples]
start_time = timer()
# tokenize the texts only for timing purposes
if not hasattr(self.tokenizer, "pipe"):
_ = [self.tokenizer(text) for text in texts]
else:
_ = list(self.tokenizer.pipe(texts))
for name, pipe in self.pipeline: for name, pipe in self.pipeline:
kwargs = component_cfg.get(name, {}) kwargs = component_cfg.get(name, {})
kwargs.setdefault("batch_size", batch_size) kwargs.setdefault("batch_size", batch_size)
@ -1096,11 +1104,18 @@ class Language:
docs = _pipe(docs, pipe, kwargs) docs = _pipe(docs, pipe, kwargs)
else: else:
docs = pipe.pipe(docs, **kwargs) docs = pipe.pipe(docs, **kwargs)
# iterate over the final generator
if len(self.pipeline):
docs = list(docs)
end_time = timer()
for i, (doc, eg) in enumerate(zip(docs, examples)): for i, (doc, eg) in enumerate(zip(docs, examples)):
if verbose: if verbose:
print(doc) print(doc)
eg.predicted = doc eg.predicted = doc
return scorer.score(examples) results = scorer.score(examples)
n_words = sum(len(eg.predicted) for eg in examples)
results["speed"] = n_words / (end_time - start_time)
return results
@contextmanager @contextmanager
def use_params(self, params: dict): def use_params(self, params: dict):