Add spans in spacy benchmark (#12575)

* Add spans in spacy benchmark

The current implementation of `spacy benchmark accuracy` / `spacy evaluate`
does not handle the "spans" displaCy type, so running the command on a
pipeline with a span categorizer does not render the corresponding displaCy
HTML file.

This PR fixes that by adding a new `spans` parameter to the rendering step
and calling displaCy with the matching style.

* Reformat file with black

* Add tests for evaluate

* Fix spans -> span for displacy style

* Update test to check render instead

* Update source so mypy passes

* Add parser information to avoid warnings
This commit is contained in:
Lj Miranda 2023-04-28 20:32:52 +08:00 committed by Adriane Boyd
parent 139368d9ce
commit 9ec12fcfde
2 changed files with 75 additions and 0 deletions

View File

@ -122,6 +122,8 @@ def evaluate(
docs = list(nlp.pipe(ex.reference.text for ex in dev_dataset[:displacy_limit])) docs = list(nlp.pipe(ex.reference.text for ex in dev_dataset[:displacy_limit]))
render_deps = "parser" in factory_names render_deps = "parser" in factory_names
render_ents = "ner" in factory_names render_ents = "ner" in factory_names
render_spans = "spancat" in factory_names
render_parses( render_parses(
docs, docs,
displacy_path, displacy_path,
@ -129,6 +131,7 @@ def evaluate(
limit=displacy_limit, limit=displacy_limit,
deps=render_deps, deps=render_deps,
ents=render_ents, ents=render_ents,
spans=render_spans,
) )
msg.good(f"Generated {displacy_limit} parses as HTML", displacy_path) msg.good(f"Generated {displacy_limit} parses as HTML", displacy_path)
@ -182,6 +185,7 @@ def render_parses(
limit: int = 250, limit: int = 250,
deps: bool = True, deps: bool = True,
ents: bool = True, ents: bool = True,
spans: bool = True,
): ):
docs[0].user_data["title"] = model_name docs[0].user_data["title"] = model_name
if ents: if ents:
@ -195,6 +199,11 @@ def render_parses(
with (output_path / "parses.html").open("w", encoding="utf8") as file_: with (output_path / "parses.html").open("w", encoding="utf8") as file_:
file_.write(html) file_.write(html)
if spans:
html = displacy.render(docs[:limit], style="span", page=True)
with (output_path / "spans.html").open("w", encoding="utf8") as file_:
file_.write(html)
def print_prf_per_type( def print_prf_per_type(
msg: Printer, scores: Dict[str, Dict[str, float]], name: str, type: str msg: Printer, scores: Dict[str, Dict[str, float]], name: str, type: str

View File

@ -12,6 +12,7 @@ import srsly
from click import NoSuchOption from click import NoSuchOption
from packaging.specifiers import SpecifierSet from packaging.specifiers import SpecifierSet
from thinc.api import Config, ConfigValidationError from thinc.api import Config, ConfigValidationError
from spacy.tokens import DocBin
from spacy import about from spacy import about
from spacy.cli import info from spacy.cli import info
@ -27,6 +28,7 @@ from spacy.cli.debug_data import _get_span_characteristics
from spacy.cli.debug_data import _print_span_characteristics from spacy.cli.debug_data import _print_span_characteristics
from spacy.cli.debug_data import _get_spans_length_freq_dist from spacy.cli.debug_data import _get_spans_length_freq_dist
from spacy.cli.download import get_compatibility, get_version from spacy.cli.download import get_compatibility, get_version
from spacy.cli.evaluate import render_parses
from spacy.cli.init_config import RECOMMENDATIONS, init_config, fill_config from spacy.cli.init_config import RECOMMENDATIONS, init_config, fill_config
from spacy.cli.init_pipeline import _init_labels from spacy.cli.init_pipeline import _init_labels
from spacy.cli.package import get_third_party_dependencies from spacy.cli.package import get_third_party_dependencies
@ -144,6 +146,70 @@ def test_issue11235():
assert cfg["commands"][0]["script"][0] == f"hello {lang_var}" assert cfg["commands"][0]["script"][0] == f"hello {lang_var}"
@pytest.mark.issue(12566)
@pytest.mark.parametrize(
    "factory,output_file",
    [("deps", "parses.html"), ("ents", "entities.html"), ("spans", "spans.html")],
)
def test_issue12566(factory: str, output_file: str):
    """
    Test that each displaCy render flag (deps, ents, spans) produces its HTML file.

    factory: Name of the `render_parses` boolean flag under test.
    output_file: Name of the HTML file that flag is expected to write.

    Only the flag under test is enabled for each case, so a missing output
    file points directly at the corresponding rendering branch.
    """
    with make_tempdir() as tmp_dir:
        # Sample document in JSON form, carrying entity, span ("sc" key) and
        # dependency annotations so that every render style has data to draw.
        doc_json = {
            "ents": [
                {"end": 54, "label": "nam_adj_country", "start": 44},
                {"end": 83, "label": "nam_liv_person", "start": 69},
                {"end": 100, "label": "nam_pro_title_book", "start": 86},
            ],
            "spans": {
                "sc": [
                    {"end": 54, "kb_id": "", "label": "nam_adj_country", "start": 44},
                    {"end": 83, "kb_id": "", "label": "nam_liv_person", "start": 69},
                    {
                        "end": 100,
                        "kb_id": "",
                        "label": "nam_pro_title_book",
                        "start": 86,
                    },
                ]
            },
            "text": "Niedawno czytał em nową książkę znakomitego szkockiego medioznawcy , "
            "Briana McNaira - Cultural Chaos .",
            "tokens": [
                # fmt: off
                {"id": 0, "start": 0, "end": 8, "tag": "ADV", "pos": "ADV", "morph": "Degree=Pos", "lemma": "niedawno", "dep": "advmod", "head": 1, },
                {"id": 1, "start": 9, "end": 15, "tag": "PRAET", "pos": "VERB", "morph": "Animacy=Hum|Aspect=Imp|Gender=Masc|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act", "lemma": "czytać", "dep": "ROOT", "head": 1, },
                {"id": 2, "start": 16, "end": 18, "tag": "AGLT", "pos": "NOUN", "morph": "Animacy=Inan|Case=Ins|Gender=Masc|Number=Sing", "lemma": "em", "dep": "iobj", "head": 1, },
                {"id": 3, "start": 19, "end": 23, "tag": "ADJ", "pos": "ADJ", "morph": "Case=Acc|Degree=Pos|Gender=Fem|Number=Sing", "lemma": "nowy", "dep": "amod", "head": 4, },
                {"id": 4, "start": 24, "end": 31, "tag": "SUBST", "pos": "NOUN", "morph": "Case=Acc|Gender=Fem|Number=Sing", "lemma": "książka", "dep": "obj", "head": 1, },
                {"id": 5, "start": 32, "end": 43, "tag": "ADJ", "pos": "ADJ", "morph": "Animacy=Nhum|Case=Gen|Degree=Pos|Gender=Masc|Number=Sing", "lemma": "znakomit", "dep": "acl", "head": 4, },
                {"id": 6, "start": 44, "end": 54, "tag": "ADJ", "pos": "ADJ", "morph": "Animacy=Hum|Case=Gen|Degree=Pos|Gender=Masc|Number=Sing", "lemma": "szkockiy", "dep": "amod", "head": 7, },
                {"id": 7, "start": 55, "end": 66, "tag": "SUBST", "pos": "NOUN", "morph": "Animacy=Hum|Case=Gen|Gender=Masc|Number=Sing", "lemma": "medioznawca", "dep": "iobj", "head": 5, },
                {"id": 8, "start": 67, "end": 68, "tag": "INTERP", "pos": "PUNCT", "morph": "PunctType=Comm", "lemma": ",", "dep": "punct", "head": 9, },
                {"id": 9, "start": 69, "end": 75, "tag": "SUBST", "pos": "PROPN", "morph": "Animacy=Hum|Case=Gen|Gender=Masc|Number=Sing", "lemma": "Brian", "dep": "nmod", "head": 4, },
                {"id": 10, "start": 76, "end": 83, "tag": "SUBST", "pos": "PROPN", "morph": "Animacy=Hum|Case=Gen|Gender=Masc|Number=Sing", "lemma": "McNair", "dep": "flat", "head": 9, },
                {"id": 11, "start": 84, "end": 85, "tag": "INTERP", "pos": "PUNCT", "morph": "PunctType=Dash", "lemma": "-", "dep": "punct", "head": 12, },
                {"id": 12, "start": 86, "end": 94, "tag": "SUBST", "pos": "PROPN", "morph": "Animacy=Inan|Case=Nom|Gender=Masc|Number=Sing", "lemma": "Cultural", "dep": "conj", "head": 4, },
                {"id": 13, "start": 95, "end": 100, "tag": "SUBST", "pos": "NOUN", "morph": "Animacy=Inan|Case=Nom|Gender=Masc|Number=Sing", "lemma": "Chaos", "dep": "flat", "head": 12, },
                {"id": 14, "start": 101, "end": 102, "tag": "INTERP", "pos": "PUNCT", "morph": "PunctType=Peri", "lemma": ".", "dep": "punct", "head": 1, },
                # fmt: on
            ],
        }
        # Build the annotated Doc directly from the JSON data
        nlp = spacy.blank("pl")
        doc = Doc(nlp.vocab).from_json(doc_json)
        # render_parses defaults every render flag to True, so passing only
        # `{factory: True}` would render all three outputs in every case.
        # Set all flags explicitly so that only the one under test is enabled.
        render_flags = {name: name == factory for name in ("deps", "ents", "spans")}
        # Render the HTML and check that the expected file exists
        render_parses(
            docs=[doc], output_path=tmp_dir, model_name="", limit=1, **render_flags
        )
        assert (tmp_dir / output_file).is_file()
def test_cli_info(): def test_cli_info():
nlp = Dutch() nlp = Dutch()
nlp.add_pipe("textcat") nlp.add_pipe("textcat")