diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py
index 83281543a..ee1be57a3 100644
--- a/spacy/cli/evaluate.py
+++ b/spacy/cli/evaluate.py
@@ -67,10 +67,7 @@ def evaluate(
     corpus = Corpus(data_path, data_path)
     nlp = util.load_model(model)
     dev_dataset = list(corpus.dev_dataset(nlp, gold_preproc=gold_preproc))
-    begin = timer()
     scores = nlp.evaluate(dev_dataset, verbose=False)
-    end = timer()
-    nwords = sum(len(ex.predicted) for ex in dev_dataset)
     metrics = {
         "TOK": "token_acc",
         "TAG": "tag_acc",
@@ -82,17 +79,21 @@ def evaluate(
         "NER P": "ents_p",
         "NER R": "ents_r",
         "NER F": "ents_f",
-        "Textcat": "cats_score",
-        "Sent P": "sents_p",
-        "Sent R": "sents_r",
-        "Sent F": "sents_f",
+        "TEXTCAT": "cats_score",
+        "SENT P": "sents_p",
+        "SENT R": "sents_r",
+        "SENT F": "sents_f",
+        "SPEED": "speed",
     }
     results = {}
     for metric, key in metrics.items():
         if key in scores:
             if key == "cats_score":
                 metric = metric + " (" + scores.get("cats_score_desc", "unk") + ")"
-            results[metric] = f"{scores[key]*100:.2f}"
+            if key == "speed":
+                results[metric] = f"{scores[key]:.0f}"
+            else:
+                results[metric] = f"{scores[key]*100:.2f}"
     data = {re.sub(r"[\s/]", "_", k.lower()): v for k, v in results.items()}
 
     msg.table(results, title="Results")
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 44597c73e..25eb4a3c0 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -1,5 +1,4 @@
 from typing import Optional, Dict, Any, Tuple, Union, Callable, List
-from timeit import default_timer as timer
 import srsly
 import tqdm
 from pathlib import Path
@@ -248,14 +247,11 @@ def create_evaluation_callback(
         dev_examples = list(dev_examples)
         n_words = sum(len(ex.predicted) for ex in dev_examples)
         batch_size = cfg["eval_batch_size"]
-        start_time = timer()
         if optimizer.averages:
             with nlp.use_params(optimizer.averages):
                 scores = nlp.evaluate(dev_examples, batch_size=batch_size)
         else:
             scores = nlp.evaluate(dev_examples, batch_size=batch_size)
-        end_time = timer()
-        wps = n_words / (end_time - start_time)
         # Calculate a weighted sum based on score_weights for the main score
         weights = cfg["score_weights"]
         try:
@@ -264,7 +260,6 @@ def create_evaluation_callback(
             keys = list(scores.keys())
             err = Errors.E983.format(dict="score_weights", key=str(e), keys=keys)
             raise KeyError(err)
-        scores["speed"] = wps
         return weighted_score, scores
 
     return evaluate
diff --git a/spacy/language.py b/spacy/language.py
index 79fceec95..fe0a86ed1 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -14,6 +14,7 @@ from thinc.api import get_current_ops, Config, require_gpu, Optimizer
 import srsly
 import multiprocessing as mp
 from itertools import chain, cycle
+from timeit import default_timer as timer
 
 from .tokens.underscore import Underscore
 from .vocab import Vocab, create_vocab
@@ -1088,7 +1089,14 @@ class Language:
         kwargs.setdefault("verbose", verbose)
         kwargs.setdefault("nlp", self)
         scorer = Scorer(**kwargs)
-        docs = list(eg.predicted for eg in examples)
+        texts = [eg.reference.text for eg in examples]
+        docs = [eg.predicted for eg in examples]
+        start_time = timer()
+        # tokenize the texts only for timing purposes
+        if not hasattr(self.tokenizer, "pipe"):
+            _ = [self.tokenizer(text) for text in texts]
+        else:
+            _ = list(self.tokenizer.pipe(texts))
         for name, pipe in self.pipeline:
             kwargs = component_cfg.get(name, {})
             kwargs.setdefault("batch_size", batch_size)
@@ -1096,11 +1104,18 @@
                 docs = _pipe(docs, pipe, kwargs)
             else:
                 docs = pipe.pipe(docs, **kwargs)
+        # iterate over the final generator
+        if len(self.pipeline):
+            docs = list(docs)
+        end_time = timer()
         for i, (doc, eg) in enumerate(zip(docs, examples)):
             if verbose:
                 print(doc)
             eg.predicted = doc
-        return scorer.score(examples)
+        results = scorer.score(examples)
+        n_words = sum(len(eg.predicted) for eg in examples)
+        results["speed"] = n_words / (end_time - start_time)
+        return results
 
     @contextmanager
     def use_params(self, params: dict):
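
For context, here is a minimal sketch of the words-per-second figure this patch moves into `Language.evaluate`. The helper name and the use of `nlp.pipe` are illustrative only; the patched method instead times the tokenizer and each pipeline component on the evaluation docs. The resulting number is what ends up in `results["speed"]` and in the SPEED row printed by `spacy evaluate`.

```python
from timeit import default_timer as timer

def words_per_second(nlp, texts):
    # Hypothetical helper, not part of the patch: time a full pass over the
    # texts and report throughput the same way the new "speed" score does.
    start_time = timer()
    docs = list(nlp.pipe(texts))  # force evaluation of the generator
    end_time = timer()
    n_words = sum(len(doc) for doc in docs)
    return n_words / (end_time - start_time)
```

With this change, `nlp.evaluate(dev_dataset)` returns a `speed` entry alongside the accuracy scores, and the CLI formats it without decimals (`f"{scores[key]:.0f}"`) instead of scaling it by 100 like the percentage metrics.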