mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-31 07:57:35 +03:00 
			
		
		
		
	Add spans in spacy benchmark (#12575)
* Add spans in spacy benchmark

  The current implementation of `spacy benchmark accuracy` / `spacy evaluate` doesn't handle the "spans" type, so calling the command doesn't render the displaCy HTML file for spans. This PR fixes that by adding a new "spans" parameter and rendering with the corresponding displaCy style.
* Reformat file with black
* Add tests for evaluate
* Fix spans -> span for displacy style
* Update test to check render instead
* Update source so mypy passes
* Add parser information to avoid warnings
This commit is contained in:
		
							parent
							
								
									139368d9ce
								
							
						
					
					
						commit
						9ec12fcfde
					
				|  | @ -122,6 +122,8 @@ def evaluate( | ||||||
|         docs = list(nlp.pipe(ex.reference.text for ex in dev_dataset[:displacy_limit])) |         docs = list(nlp.pipe(ex.reference.text for ex in dev_dataset[:displacy_limit])) | ||||||
|         render_deps = "parser" in factory_names |         render_deps = "parser" in factory_names | ||||||
|         render_ents = "ner" in factory_names |         render_ents = "ner" in factory_names | ||||||
|  |         render_spans = "spancat" in factory_names | ||||||
|  | 
 | ||||||
|         render_parses( |         render_parses( | ||||||
|             docs, |             docs, | ||||||
|             displacy_path, |             displacy_path, | ||||||
|  | @ -129,6 +131,7 @@ def evaluate( | ||||||
|             limit=displacy_limit, |             limit=displacy_limit, | ||||||
|             deps=render_deps, |             deps=render_deps, | ||||||
|             ents=render_ents, |             ents=render_ents, | ||||||
|  |             spans=render_spans, | ||||||
|         ) |         ) | ||||||
|         msg.good(f"Generated {displacy_limit} parses as HTML", displacy_path) |         msg.good(f"Generated {displacy_limit} parses as HTML", displacy_path) | ||||||
| 
 | 
 | ||||||
|  | @ -182,6 +185,7 @@ def render_parses( | ||||||
|     limit: int = 250, |     limit: int = 250, | ||||||
|     deps: bool = True, |     deps: bool = True, | ||||||
|     ents: bool = True, |     ents: bool = True, | ||||||
|  |     spans: bool = True, | ||||||
| ): | ): | ||||||
|     docs[0].user_data["title"] = model_name |     docs[0].user_data["title"] = model_name | ||||||
|     if ents: |     if ents: | ||||||
|  | @ -195,6 +199,11 @@ def render_parses( | ||||||
|         with (output_path / "parses.html").open("w", encoding="utf8") as file_: |         with (output_path / "parses.html").open("w", encoding="utf8") as file_: | ||||||
|             file_.write(html) |             file_.write(html) | ||||||
| 
 | 
 | ||||||
|  |     if spans: | ||||||
|  |         html = displacy.render(docs[:limit], style="span", page=True) | ||||||
|  |         with (output_path / "spans.html").open("w", encoding="utf8") as file_: | ||||||
|  |             file_.write(html) | ||||||
|  | 
 | ||||||
| 
 | 
 | ||||||
| def print_prf_per_type( | def print_prf_per_type( | ||||||
|     msg: Printer, scores: Dict[str, Dict[str, float]], name: str, type: str |     msg: Printer, scores: Dict[str, Dict[str, float]], name: str, type: str | ||||||
|  |  | ||||||
|  | @ -12,6 +12,7 @@ import srsly | ||||||
| from click import NoSuchOption | from click import NoSuchOption | ||||||
| from packaging.specifiers import SpecifierSet | from packaging.specifiers import SpecifierSet | ||||||
| from thinc.api import Config, ConfigValidationError | from thinc.api import Config, ConfigValidationError | ||||||
|  | from spacy.tokens import DocBin | ||||||
| 
 | 
 | ||||||
| from spacy import about | from spacy import about | ||||||
| from spacy.cli import info | from spacy.cli import info | ||||||
|  | @ -27,6 +28,7 @@ from spacy.cli.debug_data import _get_span_characteristics | ||||||
| from spacy.cli.debug_data import _print_span_characteristics | from spacy.cli.debug_data import _print_span_characteristics | ||||||
| from spacy.cli.debug_data import _get_spans_length_freq_dist | from spacy.cli.debug_data import _get_spans_length_freq_dist | ||||||
| from spacy.cli.download import get_compatibility, get_version | from spacy.cli.download import get_compatibility, get_version | ||||||
|  | from spacy.cli.evaluate import render_parses | ||||||
| from spacy.cli.init_config import RECOMMENDATIONS, init_config, fill_config | from spacy.cli.init_config import RECOMMENDATIONS, init_config, fill_config | ||||||
| from spacy.cli.init_pipeline import _init_labels | from spacy.cli.init_pipeline import _init_labels | ||||||
| from spacy.cli.package import get_third_party_dependencies | from spacy.cli.package import get_third_party_dependencies | ||||||
|  | @ -144,6 +146,70 @@ def test_issue11235(): | ||||||
|     assert cfg["commands"][0]["script"][0] == f"hello {lang_var}" |     assert cfg["commands"][0]["script"][0] == f"hello {lang_var}" | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | @pytest.mark.issue(12566) | ||||||
|  | @pytest.mark.parametrize( | ||||||
|  |     "factory,output_file", | ||||||
|  |     [("deps", "parses.html"), ("ents", "entities.html"), ("spans", "spans.html")], | ||||||
|  | ) | ||||||
|  | def test_issue12566(factory: str, output_file: str): | ||||||
|  |     """ | ||||||
|  |     Test if all displaCy types (ents, dep, spans) produce an HTML file | ||||||
|  |     """ | ||||||
|  |     with make_tempdir() as tmp_dir: | ||||||
|  |         # Create sample spaCy file | ||||||
|  |         doc_json = { | ||||||
|  |             "ents": [ | ||||||
|  |                 {"end": 54, "label": "nam_adj_country", "start": 44}, | ||||||
|  |                 {"end": 83, "label": "nam_liv_person", "start": 69}, | ||||||
|  |                 {"end": 100, "label": "nam_pro_title_book", "start": 86}, | ||||||
|  |             ], | ||||||
|  |             "spans": { | ||||||
|  |                 "sc": [ | ||||||
|  |                     {"end": 54, "kb_id": "", "label": "nam_adj_country", "start": 44}, | ||||||
|  |                     {"end": 83, "kb_id": "", "label": "nam_liv_person", "start": 69}, | ||||||
|  |                     { | ||||||
|  |                         "end": 100, | ||||||
|  |                         "kb_id": "", | ||||||
|  |                         "label": "nam_pro_title_book", | ||||||
|  |                         "start": 86, | ||||||
|  |                     }, | ||||||
|  |                 ] | ||||||
|  |             }, | ||||||
|  |             "text": "Niedawno czytał em nową książkę znakomitego szkockiego medioznawcy , " | ||||||
|  |             "Briana McNaira - Cultural Chaos .", | ||||||
|  |             "tokens": [ | ||||||
|  |                 # fmt: off | ||||||
|  |                 {"id": 0, "start": 0, "end": 8, "tag": "ADV", "pos": "ADV", "morph": "Degree=Pos", "lemma": "niedawno", "dep": "advmod", "head": 1, }, | ||||||
|  |                 {"id": 1, "start": 9, "end": 15, "tag": "PRAET", "pos": "VERB", "morph": "Animacy=Hum|Aspect=Imp|Gender=Masc|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act", "lemma": "czytać", "dep": "ROOT", "head": 1, }, | ||||||
|  |                 {"id": 2, "start": 16, "end": 18, "tag": "AGLT", "pos": "NOUN", "morph": "Animacy=Inan|Case=Ins|Gender=Masc|Number=Sing", "lemma": "em", "dep": "iobj", "head": 1, }, | ||||||
|  |                 {"id": 3, "start": 19, "end": 23, "tag": "ADJ", "pos": "ADJ", "morph": "Case=Acc|Degree=Pos|Gender=Fem|Number=Sing", "lemma": "nowy", "dep": "amod", "head": 4, }, | ||||||
|  |                 {"id": 4, "start": 24, "end": 31, "tag": "SUBST", "pos": "NOUN", "morph": "Case=Acc|Gender=Fem|Number=Sing", "lemma": "książka", "dep": "obj", "head": 1, }, | ||||||
|  |                 {"id": 5, "start": 32, "end": 43, "tag": "ADJ", "pos": "ADJ", "morph": "Animacy=Nhum|Case=Gen|Degree=Pos|Gender=Masc|Number=Sing", "lemma": "znakomit", "dep": "acl", "head": 4, }, | ||||||
|  |                 {"id": 6, "start": 44, "end": 54, "tag": "ADJ", "pos": "ADJ", "morph": "Animacy=Hum|Case=Gen|Degree=Pos|Gender=Masc|Number=Sing", "lemma": "szkockiy", "dep": "amod", "head": 7, }, | ||||||
|  |                 {"id": 7, "start": 55, "end": 66, "tag": "SUBST", "pos": "NOUN", "morph": "Animacy=Hum|Case=Gen|Gender=Masc|Number=Sing", "lemma": "medioznawca", "dep": "iobj", "head": 5, }, | ||||||
|  |                 {"id": 8, "start": 67, "end": 68, "tag": "INTERP", "pos": "PUNCT", "morph": "PunctType=Comm", "lemma": ",", "dep": "punct", "head": 9, }, | ||||||
|  |                 {"id": 9, "start": 69, "end": 75, "tag": "SUBST", "pos": "PROPN", "morph": "Animacy=Hum|Case=Gen|Gender=Masc|Number=Sing", "lemma": "Brian", "dep": "nmod", "head": 4, }, | ||||||
|  |                 {"id": 10, "start": 76, "end": 83, "tag": "SUBST", "pos": "PROPN", "morph": "Animacy=Hum|Case=Gen|Gender=Masc|Number=Sing", "lemma": "McNair", "dep": "flat", "head": 9, }, | ||||||
|  |                 {"id": 11, "start": 84, "end": 85, "tag": "INTERP", "pos": "PUNCT", "morph": "PunctType=Dash", "lemma": "-", "dep": "punct", "head": 12, }, | ||||||
|  |                 {"id": 12, "start": 86, "end": 94, "tag": "SUBST", "pos": "PROPN", "morph": "Animacy=Inan|Case=Nom|Gender=Masc|Number=Sing", "lemma": "Cultural", "dep": "conj", "head": 4, }, | ||||||
|  |                 {"id": 13, "start": 95, "end": 100, "tag": "SUBST", "pos": "NOUN", "morph": "Animacy=Inan|Case=Nom|Gender=Masc|Number=Sing", "lemma": "Chaos", "dep": "flat", "head": 12, }, | ||||||
|  |                 {"id": 14, "start": 101, "end": 102, "tag": "INTERP", "pos": "PUNCT", "morph": "PunctType=Peri", "lemma": ".", "dep": "punct", "head": 1, }, | ||||||
|  |                 # fmt: on | ||||||
|  |             ], | ||||||
|  |         } | ||||||
|  | 
 | ||||||
|  |         # Create a .spacy file | ||||||
|  |         nlp = spacy.blank("pl") | ||||||
|  |         doc = Doc(nlp.vocab).from_json(doc_json) | ||||||
|  | 
 | ||||||
|  |         # Run the evaluate command and check if the html files exist | ||||||
|  |         render_parses( | ||||||
|  |             docs=[doc], output_path=tmp_dir, model_name="", limit=1, **{factory: True} | ||||||
|  |         ) | ||||||
|  | 
 | ||||||
|  |         assert (tmp_dir / output_file).is_file() | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
| def test_cli_info(): | def test_cli_info(): | ||||||
|     nlp = Dutch() |     nlp = Dutch() | ||||||
|     nlp.add_pipe("textcat") |     nlp.add_pipe("textcat") | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue
	
	Block a user