mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-13 18:56:36 +03:00
a4b32b9552
* Handle missing reference values in scorer Handle missing values in reference doc during scoring where it is possible to detect an unset state for the attribute. If no reference docs contain annotation, `None` is returned instead of a score. `spacy evaluate` displays `-` for missing scores and the missing scores are saved as `None`/`null` in the metrics. Attributes without unset states: * `token.head`: relies on `token.dep` to recognize unset values * `doc.cats`: unable to handle missing annotation Additional changes: * add optional `has_annotation` check to `score_scans` to replace `doc.sents` hack * update `score_token_attr_per_feat` to handle missing and empty morph representations * fix bug in `Doc.has_annotation` for normalization of `IS_SENT_START` vs. `SENT_START` * Fix import * Update return types
197 lines
7.0 KiB
Python
197 lines
7.0 KiB
Python
from typing import Optional, List, Dict
|
|
from wasabi import Printer
|
|
from pathlib import Path
|
|
import re
|
|
import srsly
|
|
from thinc.api import fix_random_seed
|
|
|
|
from ..training import Corpus
|
|
from ..tokens import Doc
|
|
from ._util import app, Arg, Opt, setup_gpu, import_code
|
|
from ..scorer import Scorer
|
|
from .. import util
|
|
from .. import displacy
|
|
|
|
|
|
@app.command("evaluate")
|
|
def evaluate_cli(
|
|
# fmt: off
|
|
model: str = Arg(..., help="Model name or path"),
|
|
data_path: Path = Arg(..., help="Location of binary evaluation data in .spacy format", exists=True),
|
|
output: Optional[Path] = Opt(None, "--output", "-o", help="Output JSON file for metrics", dir_okay=False),
|
|
code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
|
|
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
|
|
gold_preproc: bool = Opt(False, "--gold-preproc", "-G", help="Use gold preprocessing"),
|
|
displacy_path: Optional[Path] = Opt(None, "--displacy-path", "-dp", help="Directory to output rendered parses as HTML", exists=True, file_okay=False),
|
|
displacy_limit: int = Opt(25, "--displacy-limit", "-dl", help="Limit of parses to render as HTML"),
|
|
# fmt: on
|
|
):
|
|
"""
|
|
Evaluate a trained pipeline. Expects a loadable spaCy pipeline and evaluation
|
|
data in the binary .spacy format. The --gold-preproc option sets up the
|
|
evaluation examples with gold-standard sentences and tokens for the
|
|
predictions. Gold preprocessing helps the annotations align to the
|
|
tokenization, and may result in sequences of more consistent length. However,
|
|
it may reduce runtime accuracy due to train/test skew. To render a sample of
|
|
dependency parses in a HTML file, set as output directory as the
|
|
displacy_path argument.
|
|
|
|
DOCS: https://nightly.spacy.io/api/cli#evaluate
|
|
"""
|
|
import_code(code_path)
|
|
evaluate(
|
|
model,
|
|
data_path,
|
|
output=output,
|
|
use_gpu=use_gpu,
|
|
gold_preproc=gold_preproc,
|
|
displacy_path=displacy_path,
|
|
displacy_limit=displacy_limit,
|
|
silent=False,
|
|
)
|
|
|
|
|
|
def evaluate(
|
|
model: str,
|
|
data_path: Path,
|
|
output: Optional[Path] = None,
|
|
use_gpu: int = -1,
|
|
gold_preproc: bool = False,
|
|
displacy_path: Optional[Path] = None,
|
|
displacy_limit: int = 25,
|
|
silent: bool = True,
|
|
) -> Scorer:
|
|
msg = Printer(no_print=silent, pretty=not silent)
|
|
fix_random_seed()
|
|
setup_gpu(use_gpu)
|
|
data_path = util.ensure_path(data_path)
|
|
output_path = util.ensure_path(output)
|
|
displacy_path = util.ensure_path(displacy_path)
|
|
if not data_path.exists():
|
|
msg.fail("Evaluation data not found", data_path, exits=1)
|
|
if displacy_path and not displacy_path.exists():
|
|
msg.fail("Visualization output directory not found", displacy_path, exits=1)
|
|
corpus = Corpus(data_path, gold_preproc=gold_preproc)
|
|
nlp = util.load_model(model)
|
|
dev_dataset = list(corpus(nlp))
|
|
scores = nlp.evaluate(dev_dataset)
|
|
metrics = {
|
|
"TOK": "token_acc",
|
|
"TAG": "tag_acc",
|
|
"POS": "pos_acc",
|
|
"MORPH": "morph_acc",
|
|
"LEMMA": "lemma_acc",
|
|
"UAS": "dep_uas",
|
|
"LAS": "dep_las",
|
|
"NER P": "ents_p",
|
|
"NER R": "ents_r",
|
|
"NER F": "ents_f",
|
|
"TEXTCAT": "cats_score",
|
|
"SENT P": "sents_p",
|
|
"SENT R": "sents_r",
|
|
"SENT F": "sents_f",
|
|
"SPEED": "speed",
|
|
}
|
|
results = {}
|
|
data = {}
|
|
for metric, key in metrics.items():
|
|
if key in scores:
|
|
if key == "cats_score":
|
|
metric = metric + " (" + scores.get("cats_score_desc", "unk") + ")"
|
|
if isinstance(scores[key], (int, float)):
|
|
if key == "speed":
|
|
results[metric] = f"{scores[key]:.0f}"
|
|
else:
|
|
results[metric] = f"{scores[key]*100:.2f}"
|
|
else:
|
|
results[metric] = "-"
|
|
data[re.sub(r"[\s/]", "_", key.lower())] = scores[key]
|
|
|
|
msg.table(results, title="Results")
|
|
|
|
if "morph_per_feat" in scores:
|
|
if scores["morph_per_feat"]:
|
|
print_prf_per_type(msg, scores["morph_per_feat"], "MORPH", "feat")
|
|
data["morph_per_feat"] = scores["morph_per_feat"]
|
|
if "dep_las_per_type" in scores:
|
|
if scores["dep_las_per_type"]:
|
|
print_prf_per_type(msg, scores["dep_las_per_type"], "LAS", "type")
|
|
data["dep_las_per_type"] = scores["dep_las_per_type"]
|
|
if "ents_per_type" in scores:
|
|
if scores["ents_per_type"]:
|
|
print_prf_per_type(msg, scores["ents_per_type"], "NER", "type")
|
|
data["ents_per_type"] = scores["ents_per_type"]
|
|
if "cats_f_per_type" in scores:
|
|
if scores["cats_f_per_type"]:
|
|
print_prf_per_type(msg, scores["cats_f_per_type"], "Textcat F", "label")
|
|
data["cats_f_per_type"] = scores["cats_f_per_type"]
|
|
if "cats_auc_per_type" in scores:
|
|
if scores["cats_auc_per_type"]:
|
|
print_textcats_auc_per_cat(msg, scores["cats_auc_per_type"])
|
|
data["cats_auc_per_type"] = scores["cats_auc_per_type"]
|
|
|
|
if displacy_path:
|
|
factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names]
|
|
docs = [ex.predicted for ex in dev_dataset]
|
|
render_deps = "parser" in factory_names
|
|
render_ents = "ner" in factory_names
|
|
render_parses(
|
|
docs,
|
|
displacy_path,
|
|
model_name=model,
|
|
limit=displacy_limit,
|
|
deps=render_deps,
|
|
ents=render_ents,
|
|
)
|
|
msg.good(f"Generated {displacy_limit} parses as HTML", displacy_path)
|
|
|
|
if output_path is not None:
|
|
srsly.write_json(output_path, data)
|
|
msg.good(f"Saved results to {output_path}")
|
|
return data
|
|
|
|
|
|
def render_parses(
|
|
docs: List[Doc],
|
|
output_path: Path,
|
|
model_name: str = "",
|
|
limit: int = 250,
|
|
deps: bool = True,
|
|
ents: bool = True,
|
|
):
|
|
docs[0].user_data["title"] = model_name
|
|
if ents:
|
|
html = displacy.render(docs[:limit], style="ent", page=True)
|
|
with (output_path / "entities.html").open("w", encoding="utf8") as file_:
|
|
file_.write(html)
|
|
if deps:
|
|
html = displacy.render(
|
|
docs[:limit], style="dep", page=True, options={"compact": True}
|
|
)
|
|
with (output_path / "parses.html").open("w", encoding="utf8") as file_:
|
|
file_.write(html)
|
|
|
|
|
|
def print_prf_per_type(msg: Printer, scores: Dict[str, Dict[str, float]], name: str, type: str) -> None:
|
|
data = [
|
|
(k, f"{v['p']*100:.2f}", f"{v['r']*100:.2f}", f"{v['f']*100:.2f}")
|
|
for k, v in scores.items()
|
|
]
|
|
msg.table(
|
|
data,
|
|
header=("", "P", "R", "F"),
|
|
aligns=("l", "r", "r", "r"),
|
|
title=f"{name} (per {type})",
|
|
)
|
|
|
|
|
|
def print_textcats_auc_per_cat(
|
|
msg: Printer, scores: Dict[str, Dict[str, float]]
|
|
) -> None:
|
|
msg.table(
|
|
[(k, f"{v:.2f}") for k, v in scores.items()],
|
|
header=("", "ROC AUC"),
|
|
aligns=("l", "r"),
|
|
title="Textcat ROC AUC (per label)",
|
|
)
|