From 135de82a2d7073d535d1ffd1e4254e5dca37c046 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 22 Sep 2020 10:22:06 +0200 Subject: [PATCH 01/47] add textcat to quickstart --- spacy/cli/templates/quickstart_training.jinja | 48 ++++++++++++++++++- 1 file changed, 47 insertions(+), 1 deletion(-) diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja index 0db4c8a59..2c7ce024b 100644 --- a/spacy/cli/templates/quickstart_training.jinja +++ b/spacy/cli/templates/quickstart_training.jinja @@ -93,6 +93,29 @@ grad_factor = 1.0 @layers = "reduce_mean.v1" {% endif -%} +{% if "textcat" in components %} +[components.textcat] +factory = "textcat" + +{% if optimize == "accuracy" %} +[components.textcat.model] +@architectures = "spacy.TextCatEnsemble.v1" +exclusive_classes = false +width = 64 +conv_depth = 2 +embed_size = 2000 +window_size = 1 +ngram_size = 1 +nO = null + +{% else -%} +[components.textcat.model] +@architectures = "spacy.TextCatBOW.v1" +exclusive_classes = false +ngram_size = 1 +{%- endif %} +{%- endif %} + {# NON-TRANSFORMER PIPELINE #} {% else -%} @@ -167,10 +190,33 @@ nO = null @architectures = "spacy.Tok2VecListener.v1" width = ${components.tok2vec.model.encode.width} {% endif %} + +{% if "textcat" in components %} +[components.textcat] +factory = "textcat" + +{% if optimize == "accuracy" %} +[components.textcat.model] +@architectures = "spacy.TextCatEnsemble.v1" +exclusive_classes = false +width = 64 +conv_depth = 2 +embed_size = 2000 +window_size = 1 +ngram_size = 1 +nO = null + +{% else -%} +[components.textcat.model] +@architectures = "spacy.TextCatBOW.v1" +exclusive_classes = false +ngram_size = 1 +{%- endif %} +{%- endif %} {% endif %} {% for pipe in components %} -{% if pipe not in ["tagger", "parser", "ner"] %} +{% if pipe not in ["tagger", "parser", "ner", "textcat"] %} {# Other components defined by the user: we just assume they're factories #} [components.{{ pipe }}] factory = "{{ pipe }}" From db7126ead9675d70212c33ab9f09d2f67d72cf77 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 22 Sep 2020 10:31:26 +0200 Subject: [PATCH 02/47] Increment version --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index ec3c168a5..b57bbeda2 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy-nightly" -__version__ = "3.0.0a20" +__version__ = "3.0.0a21" __release__ = True __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" From 396b33257f7dff646040067c2ed7872d8c194f8b Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 22 Sep 2020 10:40:05 +0200 Subject: [PATCH 03/47] add entity_linker to jinja template --- spacy/cli/init_config.py | 2 +- spacy/cli/templates/quickstart_training.jinja | 34 ++++++++++++++++++- 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/spacy/cli/init_config.py b/spacy/cli/init_config.py index e70195e15..5203c5dbb 100644 --- a/spacy/cli/init_config.py +++ b/spacy/cli/init_config.py @@ -36,7 +36,7 @@ def init_config_cli( """ Generate a starter config.cfg for training. Based on your requirements specified via the CLI arguments, this command generates a config with the - optimal settings for you use case. This includes the choice of architecture, + optimal settings for your use case. 
This includes the choice of architecture, pretrained weights and related hyperparameters. DOCS: https://nightly.spacy.io/api/cli#init-config diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja index 2c7ce024b..0674f0964 100644 --- a/spacy/cli/templates/quickstart_training.jinja +++ b/spacy/cli/templates/quickstart_training.jinja @@ -93,6 +93,22 @@ grad_factor = 1.0 @layers = "reduce_mean.v1" {% endif -%} +{% if "entity_linker" in components -%} +[components.entity_linker] +factory = "entity_linker" +get_candidates = {"@misc":"spacy.CandidateGenerator.v1"} +incl_context = true +incl_prior = true + +[components.entity_linker.model] +@architectures = "spacy.EntityLinker.v1" +nO = null + +[components.entity_linker.model.tok2vec] +@architectures = "spacy-transformers.TransformerListener.v1" +grad_factor = 1.0 +{% endif -%} + {% if "textcat" in components %} [components.textcat] factory = "textcat" @@ -191,6 +207,22 @@ nO = null width = ${components.tok2vec.model.encode.width} {% endif %} +{% if "entity_linker" in components -%} +[components.entity_linker] +factory = "entity_linker" +get_candidates = {"@misc":"spacy.CandidateGenerator.v1"} +incl_context = true +incl_prior = true + +[components.entity_linker.model] +@architectures = "spacy.EntityLinker.v1" +nO = null + +[components.entity_linker.model.tok2vec] +@architectures = "spacy.Tok2VecListener.v1" +width = ${components.tok2vec.model.encode.width} +{% endif %} + {% if "textcat" in components %} [components.textcat] factory = "textcat" @@ -216,7 +248,7 @@ ngram_size = 1 {% endif %} {% for pipe in components %} -{% if pipe not in ["tagger", "parser", "ner", "textcat"] %} +{% if pipe not in ["tagger", "parser", "ner", "textcat", "entity_linker"] %} {# Other components defined by the user: we just assume they're factories #} [components.{{ pipe }}] factory = "{{ pipe }}" From e931f4d75771dc63b2573e2cbd7c834de96def7d Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 22 Sep 2020 10:56:43 +0200 Subject: [PATCH 04/47] add textcat score --- spacy/cli/templates/quickstart_training.jinja | 3 +++ 1 file changed, 3 insertions(+) diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja index 0674f0964..0e83b9bdb 100644 --- a/spacy/cli/templates/quickstart_training.jinja +++ b/spacy/cli/templates/quickstart_training.jinja @@ -323,3 +323,6 @@ ents_f = {{ (1.0 / components|length)|round(2) }} ents_p = 0.0 ents_r = 0.0 {%- endif -%} +{%- if "textcat" in components %} +cats_score = {{ (1.0 / components|length)|round(2) }} +{%- endif -%} From b556a1080893202651d473fc93c4b9010ee01665 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 22 Sep 2020 11:50:19 +0200 Subject: [PATCH 05/47] rename converts in_to_out --- spacy/cli/_util.py | 4 +-- spacy/cli/convert.py | 14 ++++----- spacy/errors.py | 2 +- spacy/tests/regression/test_issue4001-4500.py | 4 +-- spacy/tests/regression/test_issue4501-5000.py | 6 ++-- spacy/tests/test_cli.py | 30 +++++++++---------- spacy/tests/test_scorer.py | 6 ++-- spacy/tests/training/test_training.py | 26 ++++++++-------- spacy/training/__init__.py | 4 +-- spacy/training/converters/__init__.py | 8 ++--- ...conll_ner2docs.py => conll_ner_to_docs.py} | 2 +- .../{conllu2docs.py => conllu_to_docs.py} | 12 ++++---- .../{iob2docs.py => iob_to_docs.py} | 4 +-- .../{json2docs.py => json_to_docs.py} | 6 ++-- spacy/training/example.pyx | 18 +++++------ spacy/training/gold_io.pyx | 4 +-- spacy/training/iob_utils.py | 14 ++++----- 
website/docs/api/data-formats.md | 2 +- website/docs/api/top-level.md | 18 +++++------ website/docs/usage/processing-pipelines.md | 6 ++-- website/docs/usage/v3.md | 15 +++++----- 21 files changed, 103 insertions(+), 102 deletions(-) rename spacy/training/converters/{conll_ner2docs.py => conll_ner_to_docs.py} (99%) rename spacy/training/converters/{conllu2docs.py => conllu_to_docs.py} (97%) rename spacy/training/converters/{iob2docs.py => iob_to_docs.py} (95%) rename spacy/training/converters/{json2docs.py => json_to_docs.py} (82%) diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py index 797a701b9..21a4e54ce 100644 --- a/spacy/cli/_util.py +++ b/spacy/cli/_util.py @@ -378,7 +378,7 @@ def git_sparse_checkout(repo, subpath, dest, branch): # Looking for this 'rev-list' command in the git --help? Hah. cmd = f"git -C {tmp_dir} rev-list --objects --all --missing=print -- {subpath}" ret = run_command(cmd, capture=True) - git_repo = _from_http_to_git(repo) + git_repo = _http_to_git(repo) # Now pass those missings into another bit of git internals missings = " ".join([x[1:] for x in ret.stdout.split() if x.startswith("?")]) if not missings: @@ -414,7 +414,7 @@ def get_git_version( return (int(version[0]), int(version[1])) -def _from_http_to_git(repo: str) -> str: +def _http_to_git(repo: str) -> str: if repo.startswith("http://"): repo = repo.replace(r"http://", r"https://") if repo.startswith(r"https://"): diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py index ad89b9976..8f8234c61 100644 --- a/spacy/cli/convert.py +++ b/spacy/cli/convert.py @@ -9,7 +9,7 @@ import sys from ._util import app, Arg, Opt from ..training import docs_to_json from ..tokens import DocBin -from ..training.converters import iob2docs, conll_ner2docs, json2docs, conllu2docs +from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs, conllu_to_docs # Converters are matched by file extension except for ner/iob, which are @@ -18,12 +18,12 @@ from ..training.converters import iob2docs, conll_ner2docs, json2docs, conllu2do # imported from /converters. CONVERTERS = { - "conllubio": conllu2docs, - "conllu": conllu2docs, - "conll": conllu2docs, - "ner": conll_ner2docs, - "iob": iob2docs, - "json": json2docs, + "conllubio": conllu_to_docs, + "conllu": conllu_to_docs, + "conll": conllu_to_docs, + "ner": conll_ner_to_docs, + "iob": iob_to_docs, + "json": json_to_docs, } diff --git a/spacy/errors.py b/spacy/errors.py index f276c4d1a..153f8da0c 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -69,7 +69,7 @@ class Warnings: "in problems with the vocab further on in the pipeline.") W030 = ("Some entities could not be aligned in the text \"{text}\" with " "entities \"{entities}\". Use " - "`spacy.training.biluo_tags_from_offsets(nlp.make_doc(text), entities)`" + "`spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)`" " to check the alignment. 
Misaligned entities ('-') will be " "ignored during training.") W033 = ("Training a new {model} using a model with no lexeme normalization " diff --git a/spacy/tests/regression/test_issue4001-4500.py b/spacy/tests/regression/test_issue4001-4500.py index 4e58c347e..7b7ddfe0d 100644 --- a/spacy/tests/regression/test_issue4001-4500.py +++ b/spacy/tests/regression/test_issue4001-4500.py @@ -3,7 +3,7 @@ from spacy.pipeline import Pipe from spacy.matcher import PhraseMatcher, Matcher from spacy.tokens import Doc, Span, DocBin from spacy.training import Example, Corpus -from spacy.training.converters import json2docs +from spacy.training.converters import json_to_docs from spacy.vocab import Vocab from spacy.lang.en import English from spacy.util import minibatch, ensure_path, load_model @@ -425,7 +425,7 @@ def test_issue4402(): attrs = ["ORTH", "SENT_START", "ENT_IOB", "ENT_TYPE"] with make_tempdir() as tmpdir: output_file = tmpdir / "test4402.spacy" - docs = json2docs([json_data]) + docs = json_to_docs([json_data]) data = DocBin(docs=docs, attrs=attrs).to_bytes() with output_file.open("wb") as file_: file_.write(data) diff --git a/spacy/tests/regression/test_issue4501-5000.py b/spacy/tests/regression/test_issue4501-5000.py index 9454d7f0c..e351858f5 100644 --- a/spacy/tests/regression/test_issue4501-5000.py +++ b/spacy/tests/regression/test_issue4501-5000.py @@ -1,7 +1,7 @@ import pytest from spacy.tokens import Doc, Span, DocBin from spacy.training import Example -from spacy.training.converters.conllu2docs import conllu2docs +from spacy.training.converters.conllu_to_docs import conllu_to_docs from spacy.lang.en import English from spacy.kb import KnowledgeBase from spacy.vocab import Vocab @@ -82,7 +82,7 @@ def test_issue4651_without_phrase_matcher_attr(): def test_issue4665(): """ - conllu2json should not raise an exception if the HEAD column contains an + conllu_to_docs should not raise an exception if the HEAD column contains an underscore """ input_data = """ @@ -105,7 +105,7 @@ def test_issue4665(): 17 . _ PUNCT . 
_ _ punct _ _ 18 ] _ PUNCT -RRB- _ _ punct _ _ """ - conllu2docs(input_data) + conllu_to_docs(input_data) def test_issue4674(): diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index a9c9d8ca5..7141a11ff 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -1,7 +1,7 @@ import pytest from click import NoSuchOption -from spacy.training import docs_to_json, biluo_tags_from_offsets -from spacy.training.converters import iob2docs, conll_ner2docs, conllu2docs +from spacy.training import docs_to_json, offsets_to_biluo_tags +from spacy.training.converters import iob_to_docs, conll_ner_to_docs, conllu_to_docs from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate from spacy.cli.init_config import init_config, RECOMMENDATIONS from spacy.cli._util import validate_project_commands, parse_config_overrides @@ -14,7 +14,7 @@ import os from .util import make_tempdir -def test_cli_converters_conllu2json(): +def test_cli_converters_conllu_to_json(): # from NorNE: https://github.com/ltgoslo/norne/blob/3d23274965f513f23aa48455b28b1878dad23c05/ud/nob/no_bokmaal-ud-dev.conllu lines = [ "1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\tO", @@ -23,7 +23,7 @@ def test_cli_converters_conllu2json(): "4\tavstår\tavstå\tVERB\t_\tMood=Ind|Tense=Pres|VerbForm=Fin\t0\troot\t_\tO", ] input_data = "\n".join(lines) - converted_docs = conllu2docs(input_data, n_sents=1) + converted_docs = conllu_to_docs(input_data, n_sents=1) assert len(converted_docs) == 1 converted = [docs_to_json(converted_docs)] assert converted[0]["id"] == 0 @@ -39,7 +39,7 @@ def test_cli_converters_conllu2json(): ent_offsets = [ (e[0], e[1], e[2]) for e in converted[0]["paragraphs"][0]["entities"] ] - biluo_tags = biluo_tags_from_offsets(converted_docs[0], ent_offsets, missing="O") + biluo_tags = offsets_to_biluo_tags(converted_docs[0], ent_offsets, missing="O") assert biluo_tags == ["O", "B-PER", "L-PER", "O"] @@ -62,9 +62,9 @@ def test_cli_converters_conllu2json(): ), ], ) -def test_cli_converters_conllu2json_name_ner_map(lines): +def test_cli_converters_conllu_to_json_name_ner_map(lines): input_data = "\n".join(lines) - converted_docs = conllu2docs( + converted_docs = conllu_to_docs( input_data, n_sents=1, ner_map={"PER": "PERSON", "BAD": ""} ) assert len(converted_docs) == 1 @@ -83,11 +83,11 @@ def test_cli_converters_conllu2json_name_ner_map(lines): ent_offsets = [ (e[0], e[1], e[2]) for e in converted[0]["paragraphs"][0]["entities"] ] - biluo_tags = biluo_tags_from_offsets(converted_docs[0], ent_offsets, missing="O") + biluo_tags = offsets_to_biluo_tags(converted_docs[0], ent_offsets, missing="O") assert biluo_tags == ["O", "B-PERSON", "L-PERSON", "O", "O"] -def test_cli_converters_conllu2json_subtokens(): +def test_cli_converters_conllu_to_json_subtokens(): # https://raw.githubusercontent.com/ohenrik/nb_news_ud_sm/master/original_data/no-ud-dev-ner.conllu lines = [ "1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\tname=O", @@ -98,7 +98,7 @@ def test_cli_converters_conllu2json_subtokens(): "5\t.\t$.\tPUNCT\t_\t_\t4\tpunct\t_\tname=O", ] input_data = "\n".join(lines) - converted_docs = conllu2docs( + converted_docs = conllu_to_docs( input_data, n_sents=1, merge_subtokens=True, append_morphology=True ) assert len(converted_docs) == 1 @@ -132,11 +132,11 @@ def test_cli_converters_conllu2json_subtokens(): ent_offsets = [ (e[0], e[1], e[2]) for e in converted[0]["paragraphs"][0]["entities"] ] - biluo_tags = 
biluo_tags_from_offsets(converted_docs[0], ent_offsets, missing="O") + biluo_tags = offsets_to_biluo_tags(converted_docs[0], ent_offsets, missing="O") assert biluo_tags == ["O", "U-PER", "O", "O"] -def test_cli_converters_iob2json(): +def test_cli_converters_iob_to_docs(): lines = [ "I|O like|O London|I-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O", "I|O like|O London|B-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O", @@ -144,7 +144,7 @@ def test_cli_converters_iob2json(): "I|PRP|O like|VBP|O London|NNP|B-GPE and|CC|O New|NNP|B-GPE York|NNP|I-GPE City|NNP|I-GPE .|.|O", ] input_data = "\n".join(lines) - converted_docs = iob2docs(input_data, n_sents=10) + converted_docs = iob_to_docs(input_data, n_sents=10) assert len(converted_docs) == 1 converted = docs_to_json(converted_docs) assert converted["id"] == 0 @@ -161,7 +161,7 @@ def test_cli_converters_iob2json(): assert ent.text in ["New York City", "London"] -def test_cli_converters_conll_ner2json(): +def test_cli_converters_conll_ner_to_docs(): lines = [ "-DOCSTART- -X- O O", "", @@ -211,7 +211,7 @@ def test_cli_converters_conll_ner2json(): ".\t.\t_\tO", ] input_data = "\n".join(lines) - converted_docs = conll_ner2docs(input_data, n_sents=10) + converted_docs = conll_ner_to_docs(input_data, n_sents=10) assert len(converted_docs) == 1 converted = docs_to_json(converted_docs) assert converted["id"] == 0 diff --git a/spacy/tests/test_scorer.py b/spacy/tests/test_scorer.py index a1406c14a..2825f1703 100644 --- a/spacy/tests/test_scorer.py +++ b/spacy/tests/test_scorer.py @@ -2,7 +2,7 @@ from numpy.testing import assert_almost_equal, assert_array_almost_equal import pytest from pytest import approx from spacy.training import Example -from spacy.training.iob_utils import biluo_tags_from_offsets +from spacy.training.iob_utils import offsets_to_biluo_tags from spacy.scorer import Scorer, ROCAUCScore from spacy.scorer import _roc_auc_score, _roc_curve from spacy.lang.en import English @@ -186,7 +186,7 @@ def test_ner_per_type(en_vocab): words=input_.split(" "), ents=[("CARDINAL", 0, 1), ("CARDINAL", 2, 3)], ) - entities = biluo_tags_from_offsets(doc, annot["entities"]) + entities = offsets_to_biluo_tags(doc, annot["entities"]) example = Example.from_dict(doc, {"entities": entities}) # a hack for sentence boundaries example.predicted[1].is_sent_start = False @@ -211,7 +211,7 @@ def test_ner_per_type(en_vocab): words=input_.split(" "), ents=[("ORG", 0, 1), ("GPE", 5, 6), ("ORG", 6, 7)], ) - entities = biluo_tags_from_offsets(doc, annot["entities"]) + entities = offsets_to_biluo_tags(doc, annot["entities"]) example = Example.from_dict(doc, {"entities": entities}) # a hack for sentence boundaries example.predicted[1].is_sent_start = False diff --git a/spacy/tests/training/test_training.py b/spacy/tests/training/test_training.py index 4cab5b015..a04e6aadd 100644 --- a/spacy/tests/training/test_training.py +++ b/spacy/tests/training/test_training.py @@ -1,9 +1,9 @@ import numpy -from spacy.training import biluo_tags_from_offsets, offsets_from_biluo_tags, Alignment -from spacy.training import spans_from_biluo_tags, iob_to_biluo +from spacy.training import offsets_to_biluo_tags, biluo_tags_to_offsets, Alignment +from spacy.training import biluo_tags_to_spans, iob_to_biluo from spacy.training import Corpus, docs_to_json from spacy.training.example import Example -from spacy.training.converters import json2docs +from spacy.training.converters import json_to_docs from spacy.training.augment import make_orth_variants_example from spacy.lang.en import English 
from spacy.tokens import Doc, DocBin @@ -69,7 +69,7 @@ def test_gold_biluo_U(en_vocab): spaces = [True, True, True, False, True] doc = Doc(en_vocab, words=words, spaces=spaces) entities = [(len("I flew to "), len("I flew to London"), "LOC")] - tags = biluo_tags_from_offsets(doc, entities) + tags = offsets_to_biluo_tags(doc, entities) assert tags == ["O", "O", "O", "U-LOC", "O"] @@ -78,7 +78,7 @@ def test_gold_biluo_BL(en_vocab): spaces = [True, True, True, True, False, True] doc = Doc(en_vocab, words=words, spaces=spaces) entities = [(len("I flew to "), len("I flew to San Francisco"), "LOC")] - tags = biluo_tags_from_offsets(doc, entities) + tags = offsets_to_biluo_tags(doc, entities) assert tags == ["O", "O", "O", "B-LOC", "L-LOC", "O"] @@ -87,7 +87,7 @@ def test_gold_biluo_BIL(en_vocab): spaces = [True, True, True, True, True, False, True] doc = Doc(en_vocab, words=words, spaces=spaces) entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")] - tags = biluo_tags_from_offsets(doc, entities) + tags = offsets_to_biluo_tags(doc, entities) assert tags == ["O", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"] @@ -100,7 +100,7 @@ def test_gold_biluo_overlap(en_vocab): (len("I flew to "), len("I flew to San Francisco"), "LOC"), ] with pytest.raises(ValueError): - biluo_tags_from_offsets(doc, entities) + offsets_to_biluo_tags(doc, entities) def test_gold_biluo_misalign(en_vocab): @@ -109,7 +109,7 @@ def test_gold_biluo_misalign(en_vocab): doc = Doc(en_vocab, words=words, spaces=spaces) entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")] with pytest.warns(UserWarning): - tags = biluo_tags_from_offsets(doc, entities) + tags = offsets_to_biluo_tags(doc, entities) assert tags == ["O", "O", "O", "-", "-", "-"] @@ -155,7 +155,7 @@ def test_example_from_dict_some_ner(en_vocab): @pytest.mark.filterwarnings("ignore::UserWarning") -def test_json2docs_no_ner(en_vocab): +def test_json_to_docs_no_ner(en_vocab): data = [ { "id": 1, @@ -191,7 +191,7 @@ def test_json2docs_no_ner(en_vocab): ], } ] - docs = json2docs(data) + docs = json_to_docs(data) assert len(docs) == 1 for doc in docs: assert not doc.has_annotation("ENT_IOB") @@ -358,9 +358,9 @@ def test_roundtrip_offsets_biluo_conversion(en_tokenizer): biluo_tags = ["O", "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"] offsets = [(10, 24, "LOC"), (29, 35, "GPE")] doc = en_tokenizer(text) - biluo_tags_converted = biluo_tags_from_offsets(doc, offsets) + biluo_tags_converted = offsets_to_biluo_tags(doc, offsets) assert biluo_tags_converted == biluo_tags - offsets_converted = offsets_from_biluo_tags(doc, biluo_tags) + offsets_converted = biluo_tags_to_offsets(doc, biluo_tags) offsets_converted = [ent for ent in offsets if ent[2]] assert offsets_converted == offsets @@ -368,7 +368,7 @@ def test_roundtrip_offsets_biluo_conversion(en_tokenizer): def test_biluo_spans(en_tokenizer): doc = en_tokenizer("I flew to Silicon Valley via London.") biluo_tags = ["O", "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"] - spans = spans_from_biluo_tags(doc, biluo_tags) + spans = biluo_tags_to_spans(doc, biluo_tags) spans = [span for span in spans if span.label_] assert len(spans) == 2 assert spans[0].text == "Silicon Valley" diff --git a/spacy/training/__init__.py b/spacy/training/__init__.py index 35e67f696..9172dde25 100644 --- a/spacy/training/__init__.py +++ b/spacy/training/__init__.py @@ -2,8 +2,8 @@ from .corpus import Corpus # noqa: F401 from .example import Example, validate_examples # noqa: F401 from .align import Alignment # noqa: 
F401 from .iob_utils import iob_to_biluo, biluo_to_iob # noqa: F401 -from .iob_utils import biluo_tags_from_offsets, offsets_from_biluo_tags # noqa: F401 -from .iob_utils import spans_from_biluo_tags, tags_to_entities # noqa: F401 +from .iob_utils import offsets_to_biluo_tags, biluo_tags_to_offsets # noqa: F401 +from .iob_utils import biluo_tags_to_spans, tags_to_entities # noqa: F401 from .gold_io import docs_to_json, read_json_file # noqa: F401 from .batchers import minibatch_by_padded_size, minibatch_by_words # noqa: F401 from .loggers import console_logger, wandb_logger # noqa: F401 diff --git a/spacy/training/converters/__init__.py b/spacy/training/converters/__init__.py index 15f025a08..e91b6aaa6 100644 --- a/spacy/training/converters/__init__.py +++ b/spacy/training/converters/__init__.py @@ -1,4 +1,4 @@ -from .iob2docs import iob2docs # noqa: F401 -from .conll_ner2docs import conll_ner2docs # noqa: F401 -from .json2docs import json2docs # noqa: F401 -from .conllu2docs import conllu2docs # noqa: F401 +from .iob_to_docs import iob_to_docs # noqa: F401 +from .conll_ner_to_docs import conll_ner_to_docs # noqa: F401 +from .json_to_docs import json_to_docs # noqa: F401 +from .conllu_to_docs import conllu_to_docs # noqa: F401 diff --git a/spacy/training/converters/conll_ner2docs.py b/spacy/training/converters/conll_ner_to_docs.py similarity index 99% rename from spacy/training/converters/conll_ner2docs.py rename to spacy/training/converters/conll_ner_to_docs.py index 8dcaf2599..3b851039c 100644 --- a/spacy/training/converters/conll_ner2docs.py +++ b/spacy/training/converters/conll_ner_to_docs.py @@ -7,7 +7,7 @@ from ...tokens import Doc, Span from ...util import load_model -def conll_ner2docs( +def conll_ner_to_docs( input_data, n_sents=10, seg_sents=False, model=None, no_print=False, **kwargs ): """ diff --git a/spacy/training/converters/conllu2docs.py b/spacy/training/converters/conllu_to_docs.py similarity index 97% rename from spacy/training/converters/conllu2docs.py rename to spacy/training/converters/conllu_to_docs.py index b4d8b3ac4..18a2b6a93 100644 --- a/spacy/training/converters/conllu2docs.py +++ b/spacy/training/converters/conllu_to_docs.py @@ -1,13 +1,13 @@ import re -from .conll_ner2docs import n_sents_info -from ...training import iob_to_biluo, spans_from_biluo_tags +from .conll_ner_to_docs import n_sents_info +from ...training import iob_to_biluo, biluo_tags_to_spans from ...tokens import Doc, Token, Span from ...vocab import Vocab from wasabi import Printer -def conllu2docs( +def conllu_to_docs( input_data, n_sents=10, append_morphology=False, @@ -78,7 +78,7 @@ def read_conllx( if lines: while lines[0].startswith("#"): lines.pop(0) - doc = doc_from_conllu_sentence( + doc = conllu_sentence_to_doc( vocab, lines, ner_tag_pattern, @@ -128,7 +128,7 @@ def get_entities(lines, tag_pattern, ner_map=None): return iob_to_biluo(iob) -def doc_from_conllu_sentence( +def conllu_sentence_to_doc( vocab, lines, ner_tag_pattern, @@ -215,7 +215,7 @@ def doc_from_conllu_sentence( doc[i]._.merged_lemma = lemmas[i] doc[i]._.merged_spaceafter = spaces[i] ents = get_entities(lines, ner_tag_pattern, ner_map) - doc.ents = spans_from_biluo_tags(doc, ents) + doc.ents = biluo_tags_to_spans(doc, ents) if merge_subtokens: doc = merge_conllu_subtokens(lines, doc) diff --git a/spacy/training/converters/iob2docs.py b/spacy/training/converters/iob_to_docs.py similarity index 95% rename from spacy/training/converters/iob2docs.py rename to spacy/training/converters/iob_to_docs.py index 2f6742fea..bfd981649 
100644 --- a/spacy/training/converters/iob2docs.py +++ b/spacy/training/converters/iob_to_docs.py @@ -1,13 +1,13 @@ from wasabi import Printer -from .conll_ner2docs import n_sents_info +from .conll_ner_to_docs import n_sents_info from ...vocab import Vocab from ...training import iob_to_biluo, tags_to_entities from ...tokens import Doc, Span from ...util import minibatch -def iob2docs(input_data, n_sents=10, no_print=False, *args, **kwargs): +def iob_to_docs(input_data, n_sents=10, no_print=False, *args, **kwargs): """ Convert IOB files with one sentence per line and tags separated with '|' into Doc objects so they can be saved. IOB and IOB2 are accepted. diff --git a/spacy/training/converters/json2docs.py b/spacy/training/converters/json_to_docs.py similarity index 82% rename from spacy/training/converters/json2docs.py rename to spacy/training/converters/json_to_docs.py index 342f94848..d7df1d6f9 100644 --- a/spacy/training/converters/json2docs.py +++ b/spacy/training/converters/json_to_docs.py @@ -1,12 +1,12 @@ import srsly from ..gold_io import json_iterate, json_to_annotations -from ..example import annotations2doc +from ..example import annotations_to_doc from ..example import _fix_legacy_dict_data, _parse_example_dict_data from ...util import load_model from ...lang.xx import MultiLanguage -def json2docs(input_data, model=None, **kwargs): +def json_to_docs(input_data, model=None, **kwargs): nlp = load_model(model) if model is not None else MultiLanguage() if not isinstance(input_data, bytes): if not isinstance(input_data, str): @@ -17,6 +17,6 @@ def json2docs(input_data, model=None, **kwargs): for json_para in json_to_annotations(json_doc): example_dict = _fix_legacy_dict_data(json_para) tok_dict, doc_dict = _parse_example_dict_data(example_dict) - doc = annotations2doc(nlp.vocab, tok_dict, doc_dict) + doc = annotations_to_doc(nlp.vocab, tok_dict, doc_dict) docs.append(doc) return docs diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx index 371b4a06a..fbf05b224 100644 --- a/spacy/training/example.pyx +++ b/spacy/training/example.pyx @@ -7,13 +7,13 @@ from ..tokens.span cimport Span from ..tokens.span import Span from ..attrs import IDS from .align import Alignment -from .iob_utils import biluo_to_iob, biluo_tags_from_offsets, biluo_tags_from_doc -from .iob_utils import spans_from_biluo_tags +from .iob_utils import biluo_to_iob, offsets_to_biluo_tags, doc_to_biluo_tags +from .iob_utils import biluo_tags_to_spans from ..errors import Errors, Warnings from ..pipeline._parser_internals import nonproj -cpdef Doc annotations2doc(vocab, tok_annot, doc_annot): +cpdef Doc annotations_to_doc(vocab, tok_annot, doc_annot): """ Create a Doc from dictionaries with token and doc annotations. """ attrs, array = _annot2array(vocab, tok_annot, doc_annot) output = Doc(vocab, words=tok_annot["ORTH"], spaces=tok_annot["SPACY"]) @@ -92,7 +92,7 @@ cdef class Example: tok_dict["SPACY"] = [tok.whitespace_ for tok in predicted] return Example( predicted, - annotations2doc(predicted.vocab, tok_dict, doc_dict) + annotations_to_doc(predicted.vocab, tok_dict, doc_dict) ) @property @@ -176,7 +176,7 @@ cdef class Example: return [None] * len(self.x) # should this be 'missing' instead of 'None' ? 
x_ents = self.get_aligned_spans_y2x(self.y.ents) # Default to 'None' for missing values - x_tags = biluo_tags_from_offsets( + x_tags = offsets_to_biluo_tags( self.x, [(e.start_char, e.end_char, e.label_) for e in x_ents], missing=None @@ -195,7 +195,7 @@ cdef class Example: return { "doc_annotation": { "cats": dict(self.reference.cats), - "entities": biluo_tags_from_doc(self.reference), + "entities": doc_to_biluo_tags(self.reference), "links": self._links_to_dict() }, "token_annotation": { @@ -295,12 +295,12 @@ def _add_entities_to_doc(doc, ner_data): elif isinstance(ner_data[0], tuple): return _add_entities_to_doc( doc, - biluo_tags_from_offsets(doc, ner_data) + offsets_to_biluo_tags(doc, ner_data) ) elif isinstance(ner_data[0], str) or ner_data[0] is None: return _add_entities_to_doc( doc, - spans_from_biluo_tags(doc, ner_data) + biluo_tags_to_spans(doc, ner_data) ) elif isinstance(ner_data[0], Span): # Ugh, this is super messy. Really hard to set O entities @@ -388,7 +388,7 @@ def _parse_ner_tags(biluo_or_offsets, vocab, words, spaces): # This is annoying but to convert the offsets we need a Doc # that has the target tokenization. reference = Doc(vocab, words=words, spaces=spaces) - biluo = biluo_tags_from_offsets(reference, biluo_or_offsets) + biluo = offsets_to_biluo_tags(reference, biluo_or_offsets) else: biluo = biluo_or_offsets ent_iobs = [] diff --git a/spacy/training/gold_io.pyx b/spacy/training/gold_io.pyx index b58df0d71..524da0a16 100644 --- a/spacy/training/gold_io.pyx +++ b/spacy/training/gold_io.pyx @@ -3,7 +3,7 @@ import srsly from .. import util from ..errors import Warnings from ..tokens import Doc -from .iob_utils import biluo_tags_from_offsets, tags_to_entities +from .iob_utils import offsets_to_biluo_tags, tags_to_entities import json @@ -32,7 +32,7 @@ def docs_to_json(docs, doc_id=0, ner_missing_tag="O"): if ent.kb_id_: link_dict = {(ent.start_char, ent.end_char): {ent.kb_id_: 1.0}} json_para["links"].append(link_dict) - biluo_tags = biluo_tags_from_offsets(doc, json_para["entities"], missing=ner_missing_tag) + biluo_tags = offsets_to_biluo_tags(doc, json_para["entities"], missing=ner_missing_tag) attrs = ("TAG", "POS", "MORPH", "LEMMA", "DEP", "ENT_IOB") include_annotation = {attr: doc.has_annotation(attr) for attr in attrs} for j, sent in enumerate(doc.sents): diff --git a/spacy/training/iob_utils.py b/spacy/training/iob_utils.py index ceb5e16b8..63deed3a5 100644 --- a/spacy/training/iob_utils.py +++ b/spacy/training/iob_utils.py @@ -50,15 +50,15 @@ def _consume_ent(tags): return [start] + middle + [end] -def biluo_tags_from_doc(doc, missing="O"): - return biluo_tags_from_offsets( +def doc_to_biluo_tags(doc, missing="O"): + return offsets_to_biluo_tags( doc, [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents], missing=missing, ) -def biluo_tags_from_offsets(doc, entities, missing="O"): +def offsets_to_biluo_tags(doc, entities, missing="O"): """Encode labelled spans into per-token tags, using the Begin/In/Last/Unit/Out scheme (BILUO). @@ -80,7 +80,7 @@ def biluo_tags_from_offsets(doc, entities, missing="O"): >>> text = 'I like London.' 
>>> entities = [(len('I like '), len('I like London'), 'LOC')] >>> doc = nlp.tokenizer(text) - >>> tags = biluo_tags_from_offsets(doc, entities) + >>> tags = offsets_to_biluo_tags(doc, entities) >>> assert tags == ["O", "O", 'U-LOC', "O"] """ # Ensure no overlapping entity labels exist @@ -143,7 +143,7 @@ def biluo_tags_from_offsets(doc, entities, missing="O"): return biluo -def spans_from_biluo_tags(doc, tags): +def biluo_tags_to_spans(doc, tags): """Encode per-token tags following the BILUO scheme into Span object, e.g. to overwrite the doc.ents. @@ -161,7 +161,7 @@ def spans_from_biluo_tags(doc, tags): return spans -def offsets_from_biluo_tags(doc, tags): +def biluo_tags_to_offsets(doc, tags): """Encode per-token tags following the BILUO scheme into entity offsets. doc (Doc): The document that the BILUO tags refer to. @@ -172,7 +172,7 @@ def offsets_from_biluo_tags(doc, tags): `end` will be character-offset integers denoting the slice into the original string. """ - spans = spans_from_biluo_tags(doc, tags) + spans = biluo_tags_to_spans(doc, tags) return [(span.start_char, span.end_char, span.label_) for span in spans] diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md index 3a214428b..e3b3900be 100644 --- a/website/docs/api/data-formats.md +++ b/website/docs/api/data-formats.md @@ -275,7 +275,7 @@ $ python -m spacy convert ./data.json ./output.spacy > entity label, prefixed by the BILUO marker. For example `"B-ORG"` describes > the first token of a multi-token `ORG` entity and `"U-PERSON"` a single token > representing a `PERSON` entity. The -> [`biluo_tags_from_offsets`](/api/top-level#biluo_tags_from_offsets) function +> [`offsets_to_biluo_tags`](/api/top-level#offsets_to_biluo_tags) function > can help you convert entity offsets to the right format. ```python diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index 7afe02403..2c082ae0b 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -619,7 +619,7 @@ sequences in the batch. ## Training data and alignment {#gold source="spacy/training"} -### training.biluo_tags_from_offsets {#biluo_tags_from_offsets tag="function"} +### training.offsets_to_biluo_tags {#offsets_to_biluo_tags tag="function"} Encode labelled spans into per-token tags, using the [BILUO scheme](/usage/linguistic-features#accessing-ner) (Begin, In, Last, Unit, @@ -635,11 +635,11 @@ single-token entity. > #### Example > > ```python -> from spacy.training import biluo_tags_from_offsets +> from spacy.training import offsets_to_biluo_tags > > doc = nlp("I like London.") > entities = [(7, 13, "LOC")] -> tags = biluo_tags_from_offsets(doc, entities) +> tags = offsets_to_biluo_tags(doc, entities) > assert tags == ["O", "O", "U-LOC", "O"] > ``` @@ -649,7 +649,7 @@ single-token entity. | `entities` | A sequence of `(start, end, label)` triples. `start` and `end` should be character-offset integers denoting the slice into the original string. ~~List[Tuple[int, int, Union[str, int]]]~~ | | **RETURNS** | A list of strings, describing the [BILUO](/usage/linguistic-features#accessing-ner) tags. ~~List[str]~~ | -### training.offsets_from_biluo_tags {#offsets_from_biluo_tags tag="function"} +### training.biluo_tags_to_offsets {#biluo_tags_to_offsets tag="function"} Encode per-token tags following the [BILUO scheme](/usage/linguistic-features#accessing-ner) into entity offsets. 
@@ -657,11 +657,11 @@ Encode per-token tags following the > #### Example > > ```python -> from spacy.training import offsets_from_biluo_tags +> from spacy.training import biluo_tags_to_offsets > > doc = nlp("I like London.") > tags = ["O", "O", "U-LOC", "O"] -> entities = offsets_from_biluo_tags(doc, tags) +> entities = biluo_tags_to_offsets(doc, tags) > assert entities == [(7, 13, "LOC")] > ``` @@ -671,7 +671,7 @@ Encode per-token tags following the | `entities` | A sequence of [BILUO](/usage/linguistic-features#accessing-ner) tags with each tag describing one token. Each tag string will be of the form of either `""`, `"O"` or `"{action}-{label}"`, where action is one of `"B"`, `"I"`, `"L"`, `"U"`. ~~List[str]~~ | | **RETURNS** | A sequence of `(start, end, label)` triples. `start` and `end` will be character-offset integers denoting the slice into the original string. ~~List[Tuple[int, int, str]]~~ | -### training.spans_from_biluo_tags {#spans_from_biluo_tags tag="function" new="2.1"} +### training.biluo_tags_to_spans {#biluo_tags_to_spans tag="function" new="2.1"} Encode per-token tags following the [BILUO scheme](/usage/linguistic-features#accessing-ner) into @@ -681,11 +681,11 @@ token-based tags, e.g. to overwrite the `doc.ents`. > #### Example > > ```python -> from spacy.training import spans_from_biluo_tags +> from spacy.training import biluo_tags_to_spans > > doc = nlp("I like London.") > tags = ["O", "O", "U-LOC", "O"] -> doc.ents = spans_from_biluo_tags(doc, tags) +> doc.ents = biluo_tags_to_spans(doc, tags) > ``` | Name | Description | diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md index 3d756215f..97806dc2a 100644 --- a/website/docs/usage/processing-pipelines.md +++ b/website/docs/usage/processing-pipelines.md @@ -1501,7 +1501,7 @@ add those entities to the `doc.ents`, you can wrap it in a custom pipeline component function and pass it the token texts from the `Doc` object received by the component. -The [`training.spans_from_biluo_tags`](/api/top-level#spans_from_biluo_tags) is very +The [`training.biluo_tags_to_spans`](/api/top-level#biluo_tags_to_spans) is very helpful here, because it takes a `Doc` object and token-based BILUO tags and returns a sequence of `Span` objects in the `Doc` with added labels. So all your wrapper has to do is compute the entity spans and overwrite the `doc.ents`. @@ -1516,14 +1516,14 @@ wrapper has to do is compute the entity spans and overwrite the `doc.ents`. ```python ### {highlight="1,8-9"} import your_custom_entity_recognizer -from spacy.training import offsets_from_biluo_tags +from spacy.training import biluo_tags_to_spans from spacy.language import Language @Language.component("custom_ner_wrapper") def custom_ner_wrapper(doc): words = [token.text for token in doc] custom_entities = your_custom_entity_recognizer(words) - doc.ents = spans_from_biluo_tags(doc, custom_entities) + doc.ents = biluo_tags_to_spans(doc, custom_entities) return doc ``` diff --git a/website/docs/usage/v3.md b/website/docs/usage/v3.md index 406ba4b75..b3c586fe1 100644 --- a/website/docs/usage/v3.md +++ b/website/docs/usage/v3.md @@ -968,16 +968,17 @@ python -m spacy package ./output ./packages #### Data utilities and gold module {#migrating-gold} -The `spacy.gold` module has been renamed to `spacy.training`. This mostly +The `spacy.gold` module has been renamed to `spacy.training` and the conversion +utilities now follow the naming format of `x_to_y`. 
This mostly affects internals, but if you've been using the span offset conversion utilities -[`biluo_tags_from_offsets`](/api/top-level#biluo_tags_from_offsets), -[`offsets_from_biluo_tags`](/api/top-level#offsets_from_biluo_tags) or -[`spans_from_biluo_tags`](/api/top-level#spans_from_biluo_tags), you'll have to -change your imports: +[`offsets_to_biluo_tags`](/api/top-level#offsets_to_biluo_tags), +[`biluo_tags_to_offsets`](/api/top-level#biluo_tags_to_offsets) or +[`biluo_tags_to_spans`](/api/top-level#biluo_tags_to_spans), you'll have to +change your names and imports: ```diff -- from spacy.gold import biluo_tags_from_offsets, spans_from_biluo_tags -+ from spacy.training import biluo_tags_from_offsets, spans_from_biluo_tags +- from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags, spans_from_biluo_tags ++ from spacy.training import offsets_to_biluo_tags, biluo_tags_to_offsets, biluo_tags_to_spans ``` #### Migration notes for plugin maintainers {#migrating-plugins} From e1b8090b9bdc880ede79bab5f269e3c352e17183 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 22 Sep 2020 12:01:06 +0200 Subject: [PATCH 06/47] few more fixes --- spacy/tests/test_cli.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index 7141a11ff..99e83eccf 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -14,7 +14,7 @@ import os from .util import make_tempdir -def test_cli_converters_conllu_to_json(): +def test_cli_converters_conllu_to_docs(): # from NorNE: https://github.com/ltgoslo/norne/blob/3d23274965f513f23aa48455b28b1878dad23c05/ud/nob/no_bokmaal-ud-dev.conllu lines = [ "1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\tO", @@ -62,7 +62,7 @@ def test_cli_converters_conllu_to_json(): ), ], ) -def test_cli_converters_conllu_to_json_name_ner_map(lines): +def test_cli_converters_conllu_to_docs_name_ner_map(lines): input_data = "\n".join(lines) converted_docs = conllu_to_docs( input_data, n_sents=1, ner_map={"PER": "PERSON", "BAD": ""} @@ -87,7 +87,7 @@ def test_cli_converters_conllu_to_json_name_ner_map(lines): assert biluo_tags == ["O", "B-PERSON", "L-PERSON", "O", "O"] -def test_cli_converters_conllu_to_json_subtokens(): +def test_cli_converters_conllu_to_docs_subtokens(): # https://raw.githubusercontent.com/ohenrik/nb_news_ud_sm/master/original_data/no-ud-dev-ner.conllu lines = [ "1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\tname=O", From 085a1c8e2b4b3a136025ef693bb6e7537d88729f Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 22 Sep 2020 12:06:40 +0200 Subject: [PATCH 07/47] add no_output_layer to TextCatBOW config --- spacy/cli/templates/quickstart_training.jinja | 2 ++ 1 file changed, 2 insertions(+) diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja index 0e83b9bdb..a0d9f78ac 100644 --- a/spacy/cli/templates/quickstart_training.jinja +++ b/spacy/cli/templates/quickstart_training.jinja @@ -129,6 +129,7 @@ nO = null @architectures = "spacy.TextCatBOW.v1" exclusive_classes = false ngram_size = 1 +no_output_layer = false {%- endif %} {%- endif %} @@ -243,6 +244,7 @@ nO = null @architectures = "spacy.TextCatBOW.v1" exclusive_classes = false ngram_size = 1 +no_output_layer = false {%- endif %} {%- endif %} {% endif %} From 5e3b796b122fc9b1125f350b5dcda625fd9740f0 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 22 Sep 2020 12:24:39 +0200 Subject: [PATCH 08/47] Validate section 
refs in debug config --- spacy/cli/debug_config.py | 27 +++++++++++++++++++++++++-- spacy/tests/test_cli.py | 15 ++++++++++++++- 2 files changed, 39 insertions(+), 3 deletions(-) diff --git a/spacy/cli/debug_config.py b/spacy/cli/debug_config.py index 7930d0674..d07a0bb2d 100644 --- a/spacy/cli/debug_config.py +++ b/spacy/cli/debug_config.py @@ -2,7 +2,7 @@ from typing import Optional, Dict, Any, Union, List from pathlib import Path from wasabi import msg, table from thinc.api import Config -from thinc.config import VARIABLE_RE +from thinc.config import VARIABLE_RE, ConfigValidationError import typer from ._util import Arg, Opt, show_validation_error, parse_config_overrides @@ -51,7 +51,10 @@ def debug_config( msg.divider("Config validation") with show_validation_error(config_path): config = util.load_config(config_path, overrides=overrides) - nlp, _ = util.load_model_from_config(config) + nlp, resolved = util.load_model_from_config(config) + # Use the resolved config here in case user has one function returning + # a dict of corpora etc. + check_section_refs(resolved, ["training.dev_corpus", "training.train_corpus"]) msg.good("Config is valid") if show_vars: variables = get_variables(config) @@ -93,3 +96,23 @@ def get_variables(config: Config) -> Dict[str, Any]: value = util.dot_to_object(config, path) result[variable] = repr(value) return result + + +def check_section_refs(config: Config, fields: List[str]) -> None: + """Validate fields in the config that refer to other sections or values + (e.g. in the corpora) and make sure that those references exist. + """ + errors = [] + for field in fields: + # If the field doesn't exist in the config, we ignore it + try: + value = util.dot_to_object(config, field) + except KeyError: + continue + try: + util.dot_to_object(config, value) + except KeyError: + msg = f"not a valid section reference: {value}" + errors.append({"loc": field.split("."), "msg": msg}) + if errors: + raise ConfigValidationError(config, errors) diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index a9c9d8ca5..1bc246fef 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -7,7 +7,8 @@ from spacy.cli.init_config import init_config, RECOMMENDATIONS from spacy.cli._util import validate_project_commands, parse_config_overrides from spacy.cli._util import load_project_config, substitute_project_variables from spacy.cli._util import string_to_list, OVERRIDES_ENV_VAR -from thinc.config import ConfigValidationError +from spacy.cli.debug_config import check_section_refs +from thinc.config import ConfigValidationError, Config import srsly import os @@ -413,3 +414,15 @@ def test_string_to_list(value): def test_string_to_list_intify(value): assert string_to_list(value, intify=False) == ["1", "2", "3"] assert string_to_list(value, intify=True) == [1, 2, 3] + + +def test_check_section_refs(): + config = {"a": {"b": {"c": "a.d.e"}, "d": {"e": 1}}, "f": {"g": "d.f"}} + config = Config(config) + # Valid section reference + check_section_refs(config, ["a.b.c"]) + # Section that doesn't exist in this config + check_section_refs(config, ["x.y.z"]) + # Invalid section reference + with pytest.raises(ConfigValidationError): + check_section_refs(config, ["a.b.c", "f.g"]) From d53c84b6d6717375ee91d2847a3d0f24beafd8d1 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Tue, 22 Sep 2020 13:54:44 +0200 Subject: [PATCH 09/47] avoid None callback (#6100) --- spacy/pipeline/tok2vec.py | 2 +- spacy/tests/pipeline/test_tok2vec.py | 19 +++++++++++++++++++ 2 files changed, 20 
insertions(+), 1 deletion(-) diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py index 721c67a19..9ab4e42b7 100644 --- a/spacy/pipeline/tok2vec.py +++ b/spacy/pipeline/tok2vec.py @@ -127,7 +127,7 @@ class Tok2Vec(Pipe): tokvecs = self.model.predict(docs) batch_id = Tok2VecListener.get_batch_id(docs) for listener in self.listeners: - listener.receive(batch_id, tokvecs, None) + listener.receive(batch_id, tokvecs, lambda dX: []) return tokvecs def set_annotations(self, docs: Sequence[Doc], tokvecses) -> None: diff --git a/spacy/tests/pipeline/test_tok2vec.py b/spacy/tests/pipeline/test_tok2vec.py index 2e514f490..6041657d3 100644 --- a/spacy/tests/pipeline/test_tok2vec.py +++ b/spacy/tests/pipeline/test_tok2vec.py @@ -169,3 +169,22 @@ def test_tok2vec_listener(): nlp.select_pipes(disable="tok2vec") assert nlp.pipe_names == ["tagger"] nlp("Running the pipeline with the Tok2Vec component disabled.") + + +def test_tok2vec_listener_callback(): + orig_config = Config().from_str(cfg_string) + nlp, config = util.load_model_from_config(orig_config, auto_fill=True, validate=True) + assert nlp.pipe_names == ["tok2vec", "tagger"] + tagger = nlp.get_pipe("tagger") + tok2vec = nlp.get_pipe("tok2vec") + nlp._link_components() + docs = [nlp.make_doc("A random sentence")] + tok2vec.model.initialize(X=docs) + gold_array = [[1.0 for tag in ["V", "Z"]] for word in docs] + label_sample = [tagger.model.ops.asarray(gold_array, dtype="float32")] + tagger.model.initialize(X=docs, Y=label_sample) + docs = [nlp.make_doc("Another entirely random sentence")] + tok2vec.predict(docs) + Y, get_dX = tagger.model.begin_update(docs) + # assure that the backprop call works (and doesn't hit a 'None' callback) + assert get_dX(Y) is not None From e0e793be4d8146768e722c23d16cf7c5b170155e Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Tue, 22 Sep 2020 21:53:06 +0200 Subject: [PATCH 10/47] fix KB IO (#6118) --- spacy/kb.pxd | 1 - spacy/kb.pyx | 47 ++++++++++++---------- spacy/tests/pipeline/test_entity_linker.py | 23 +++++++++++ 3 files changed, 49 insertions(+), 22 deletions(-) diff --git a/spacy/kb.pxd b/spacy/kb.pxd index 695693666..4a71b26a2 100644 --- a/spacy/kb.pxd +++ b/spacy/kb.pxd @@ -140,7 +140,6 @@ cdef class KnowledgeBase: self._entries.push_back(entry) self._aliases_table.push_back(alias) - cpdef from_disk(self, loc) cpdef set_entities(self, entity_list, freq_list, vector_list) diff --git a/spacy/kb.pyx b/spacy/kb.pyx index b24ed3a20..ff5382c24 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -9,7 +9,8 @@ from libcpp.vector cimport vector from pathlib import Path import warnings -from os import path + +from spacy import util from .typedefs cimport hash_t from .errors import Errors, Warnings @@ -319,8 +320,14 @@ cdef class KnowledgeBase: return 0.0 - def to_disk(self, loc): - cdef Writer writer = Writer(loc) + def to_disk(self, path): + path = util.ensure_path(path) + if path.is_dir(): + raise ValueError(Errors.E928.format(loc=path)) + if not path.parent.exists(): + path.parent.mkdir(parents=True) + + cdef Writer writer = Writer(path) writer.write_header(self.get_size_entities(), self.entity_vector_length) # dumping the entity vectors in their original order @@ -359,7 +366,13 @@ cdef class KnowledgeBase: writer.close() - cpdef from_disk(self, loc): + def from_disk(self, path): + path = util.ensure_path(path) + if path.is_dir(): + raise ValueError(Errors.E928.format(loc=path)) + if not path.exists(): + raise ValueError(Errors.E929.format(loc=path)) + cdef hash_t entity_hash cdef hash_t alias_hash 
cdef int64_t entry_index @@ -369,7 +382,7 @@ cdef class KnowledgeBase: cdef AliasC alias cdef float vector_element - cdef Reader reader = Reader(loc) + cdef Reader reader = Reader(path) # STEP 0: load header and initialize KB cdef int64_t nr_entities @@ -450,16 +463,13 @@ cdef class KnowledgeBase: cdef class Writer: - def __init__(self, object loc): - if isinstance(loc, Path): - loc = bytes(loc) - if path.exists(loc): - if path.isdir(loc): - raise ValueError(Errors.E928.format(loc=loc)) - cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc + def __init__(self, path): + assert isinstance(path, Path) + content = bytes(path) + cdef bytes bytes_loc = content.encode('utf8') if type(content) == unicode else content self._fp = fopen(bytes_loc, 'wb') if not self._fp: - raise IOError(Errors.E146.format(path=loc)) + raise IOError(Errors.E146.format(path=path)) fseek(self._fp, 0, 0) def close(self): @@ -496,14 +506,9 @@ cdef class Writer: cdef class Reader: - def __init__(self, object loc): - if isinstance(loc, Path): - loc = bytes(loc) - if not path.exists(loc): - raise ValueError(Errors.E929.format(loc=loc)) - if path.isdir(loc): - raise ValueError(Errors.E928.format(loc=loc)) - cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc + def __init__(self, path): + content = bytes(path) + cdef bytes bytes_loc = content.encode('utf8') if type(content) == unicode else content self._fp = fopen(bytes_loc, 'rb') if not self._fp: PyErr_SetFromErrno(IOError) diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index c43d2c58e..88e0646b3 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -144,6 +144,29 @@ def test_kb_empty(nlp): entity_linker.begin_training(lambda: []) +def test_kb_serialize(nlp): + """Test serialization of the KB""" + mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1) + with make_tempdir() as d: + # normal read-write behaviour + mykb.to_disk(d / "kb") + mykb.from_disk(d / "kb") + mykb.to_disk(d / "kb.file") + mykb.from_disk(d / "kb.file") + mykb.to_disk(d / "new" / "kb") + mykb.from_disk(d / "new" / "kb") + # allow overwriting an existing file + mykb.to_disk(d / "kb.file") + with pytest.raises(ValueError): + # can not write to a directory + mykb.to_disk(d) + with pytest.raises(ValueError): + # can not read from a directory + mykb.from_disk(d) + with pytest.raises(ValueError): + # can not read from an unknown file + mykb.from_disk(d / "unknown" / "kb") + def test_candidate_generation(nlp): """Test correct candidate generation""" mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1) From 86a08f819d192e50beff97e1b90c12f0daba2975 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Tue, 22 Sep 2020 21:54:52 +0200 Subject: [PATCH 11/47] tok2vec.update instead of predict (#6113) --- spacy/cli/debug_model.py | 2 +- spacy/tests/pipeline/test_tok2vec.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/cli/debug_model.py b/spacy/cli/debug_model.py index 1d27c7c52..7f8e1dabc 100644 --- a/spacy/cli/debug_model.py +++ b/spacy/cli/debug_model.py @@ -128,7 +128,7 @@ def debug_model( goldY = None for e in range(3): if tok2vec: - tok2vec.predict(X) + tok2vec.update([Example.from_dict(x, {}) for x in X]) Y, get_dX = model.begin_update(X) if goldY is None: goldY = _simulate_gold(Y) diff --git a/spacy/tests/pipeline/test_tok2vec.py b/spacy/tests/pipeline/test_tok2vec.py index 6041657d3..985314217 100644 --- 
a/spacy/tests/pipeline/test_tok2vec.py +++ b/spacy/tests/pipeline/test_tok2vec.py @@ -184,7 +184,7 @@ def test_tok2vec_listener_callback(): label_sample = [tagger.model.ops.asarray(gold_array, dtype="float32")] tagger.model.initialize(X=docs, Y=label_sample) docs = [nlp.make_doc("Another entirely random sentence")] - tok2vec.predict(docs) + tok2vec.update([Example.from_dict(x, {}) for x in docs]) Y, get_dX = tagger.model.begin_update(docs) # assure that the backprop call works (and doesn't hit a 'None' callback) assert get_dX(Y) is not None From 4a56ea72b545ea1162ae85d3b1ccc37f809182ec Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 23 Sep 2020 09:15:07 +0200 Subject: [PATCH 12/47] fallbacks for old names --- spacy/training/iob_utils.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/spacy/training/iob_utils.py b/spacy/training/iob_utils.py index 63deed3a5..03a502912 100644 --- a/spacy/training/iob_utils.py +++ b/spacy/training/iob_utils.py @@ -50,6 +50,10 @@ def _consume_ent(tags): return [start] + middle + [end] +def biluo_tags_from_doc(doc, missing="O"): + return doc_to_biluo_tags(doc, missing) + + def doc_to_biluo_tags(doc, missing="O"): return offsets_to_biluo_tags( doc, @@ -58,6 +62,10 @@ def doc_to_biluo_tags(doc, missing="O"): ) +def biluo_tags_from_offsets(doc, entities, missing="O"): + return offsets_to_biluo_tags(doc, entities, missing) + + def offsets_to_biluo_tags(doc, entities, missing="O"): """Encode labelled spans into per-token tags, using the Begin/In/Last/Unit/Out scheme (BILUO). @@ -143,6 +151,10 @@ def offsets_to_biluo_tags(doc, entities, missing="O"): return biluo +def spans_from_biluo_tags(doc, tags): + return biluo_tags_to_spans(doc, tags) + + def biluo_tags_to_spans(doc, tags): """Encode per-token tags following the BILUO scheme into Span object, e.g. to overwrite the doc.ents. @@ -161,6 +173,10 @@ def biluo_tags_to_spans(doc, tags): return spans +def offsets_from_biluo_tags(doc, tags): + return biluo_tags_to_offsets(doc, tags) + + def biluo_tags_to_offsets(doc, tags): """Encode per-token tags following the BILUO scheme into entity offsets. 
From 556f3e4652a33eb1465e1f886310653d8e3d2fd2 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 23 Sep 2020 09:24:28 +0200 Subject: [PATCH 13/47] add pooling to NEL's TransformerListener --- spacy/cli/templates/quickstart_training.jinja | 3 +++ 1 file changed, 3 insertions(+) diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja index a0d9f78ac..c55374899 100644 --- a/spacy/cli/templates/quickstart_training.jinja +++ b/spacy/cli/templates/quickstart_training.jinja @@ -107,6 +107,9 @@ nO = null [components.entity_linker.model.tok2vec] @architectures = "spacy-transformers.TransformerListener.v1" grad_factor = 1.0 + +[components.entity_linker.model.tok2vec.pooling] +@layers = "reduce_mean.v1" {% endif -%} {% if "textcat" in components %} From f976bab710dae664501e6fecd7360053a080090e Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 23 Sep 2020 09:30:09 +0200 Subject: [PATCH 14/47] Remove empty file [ci skip] --- spacy/lang/cs/test_text.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 spacy/lang/cs/test_text.py diff --git a/spacy/lang/cs/test_text.py b/spacy/lang/cs/test_text.py deleted file mode 100644 index e69de29bb..000000000 From d8f661c9103b6b0a09de5b0e25428782d6736006 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 23 Sep 2020 09:30:26 +0200 Subject: [PATCH 15/47] Update docs [ci skip] --- README.md | 4 +- website/meta/languages.json | 239 +++++++++++++++++------------------- 2 files changed, 113 insertions(+), 130 deletions(-) diff --git a/README.md b/README.md index d23051af0..61cefb69a 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ be used in real products. spaCy comes with [pretrained pipelines](https://spacy.io/models) and vectors, and -currently supports tokenization for **59+ languages**. It features +currently supports tokenization for **60+ languages**. It features state-of-the-art speed, convolutional **neural network models** for tagging, parsing, **named entity recognition**, **text classification** and more, multi-task learning with pretrained **transformers** like BERT, as well as a production-ready training system and easy model packaging, deployment and workflow management. spaCy is commercial open-source software, released under the MIT license. @@ -69,7 +69,7 @@ it. 
## Features -- Support for **59+ languages** +- Support for **60+ languages** - **Trained pipelines** - Multi-task learning with pretrained **transformers** like BERT - Pretrained **word vectors** diff --git a/website/meta/languages.json b/website/meta/languages.json index 493f96c49..5ef3a6469 100644 --- a/website/meta/languages.json +++ b/website/meta/languages.json @@ -1,21 +1,11 @@ { "languages": [ - { - "code": "zh", - "name": "Chinese", - "models": ["zh_core_web_sm", "zh_core_web_md", "zh_core_web_lg"], - "dependencies": [ - { - "name": "Jieba", - "url": "https://github.com/fxsjy/jieba" - }, - { - "name": "PKUSeg", - "url": "https://github.com/lancopku/PKUSeg-python" - } - ], - "has_examples": true - }, + { "code": "af", "name": "Afrikaans" }, + { "code": "ar", "name": "Arabic", "example": "هذه جملة", "has_examples": true }, + { "code": "bg", "name": "Bulgarian", "example": "Това е изречение", "has_examples": true }, + { "code": "bn", "name": "Bengali", "has_examples": true }, + { "code": "ca", "name": "Catalan", "example": "Això és una frase.", "has_examples": true }, + { "code": "cs", "name": "Czech", "has_examples": true }, { "code": "da", "name": "Danish", @@ -23,39 +13,10 @@ "has_examples": true, "models": ["da_core_news_sm", "da_core_news_md", "da_core_news_lg"] }, - { - "code": "nl", - "name": "Dutch", - "models": ["nl_core_news_sm", "nl_core_news_md", "nl_core_news_lg"], - "example": "Dit is een zin.", - "has_examples": true - }, - { - "code": "en", - "name": "English", - "models": ["en_core_web_sm", "en_core_web_md", "en_core_web_lg"], - "starters": [ - "en_vectors_web_lg", - "en_trf_bertbaseuncased_lg", - "en_trf_robertabase_lg", - "en_trf_distilbertbaseuncased_lg", - "en_trf_xlnetbasecased_lg" - ], - "example": "This is a sentence.", - "has_examples": true - }, - { - "code": "fr", - "name": "French", - "models": ["fr_core_news_sm", "fr_core_news_md", "fr_core_news_lg"], - "example": "C'est une phrase.", - "has_examples": true - }, { "code": "de", "name": "German", - "models": ["de_core_news_sm", "de_core_news_md", "de_core_news_lg"], - "starters": ["de_trf_bertbasecased_lg"], + "models": ["de_core_news_sm", "de_core_news_md", "de_core_news_lg", "de_dep_news_trf"], "example": "Dies ist ein Satz.", "has_examples": true }, @@ -66,6 +27,46 @@ "example": "Αυτή είναι μια πρόταση.", "has_examples": true }, + { + "code": "en", + "name": "English", + "models": ["en_core_web_sm", "en_core_web_md", "en_core_web_lg", "en_core_web_trf"], + "starters": ["en_vectors_web_lg"], + "example": "This is a sentence.", + "has_examples": true + }, + { + "code": "es", + "name": "Spanish", + "models": ["es_core_news_sm", "es_core_news_md", "es_core_news_lg", "es_dep_news_trf"], + "example": "Esto es una frase.", + "has_examples": true + }, + { "code": "et", "name": "Estonian" }, + { "code": "eu", "name": "Basque", "has_examples": true }, + { "code": "fa", "name": "Persian", "has_examples": true }, + { "code": "fi", "name": "Finnish", "has_examples": true }, + { + "code": "fr", + "name": "French", + "models": ["fr_core_news_sm", "fr_core_news_md", "fr_core_news_lg", "fr_dep_news_trf"], + "example": "C'est une phrase.", + "has_examples": true + }, + { "code": "ga", "name": "Irish" }, + { "code": "gu", "name": "Gujarati", "has_examples": true }, + { "code": "he", "name": "Hebrew", "example": "זהו משפט.", "has_examples": true }, + { "code": "hi", "name": "Hindi", "example": "यह एक वाक्य है।", "has_examples": true }, + { "code": "hr", "name": "Croatian", "has_examples": true }, + { "code": "hu", "name": 
"Hungarian", "example": "Ez egy mondat.", "has_examples": true }, + { "code": "hy", "name": "Armenian", "has_examples": true }, + { + "code": "id", + "name": "Indonesian", + "example": "Ini adalah sebuah kalimat.", + "has_examples": true + }, + { "code": "is", "name": "Icelandic" }, { "code": "it", "name": "Italian", @@ -88,12 +89,37 @@ "example": "これは文章です。", "has_examples": true }, + { "code": "kn", "name": "Kannada", "has_examples": true }, + { + "code": "ko", + "name": "Korean", + "dependencies": [ + { + "name": "mecab-ko", + "url": "https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md" + }, + { "name": "mecab-ko-dic", "url": "https://bitbucket.org/eunjeon/mecab-ko-dic" }, + { "name": "natto-py", "url": "https://github.com/buruzaemon/natto-py" } + ], + "example": "이것은 문장입니다.", + "has_examples": true + }, + { "code": "lb", "name": "Luxembourgish", "has_examples": true }, + { + "code": "lij", + "name": "Ligurian", + "example": "Sta chì a l'é unna fraxe.", + "has_examples": true + }, { "code": "lt", "name": "Lithuanian", "has_examples": true, "models": ["lt_core_news_sm", "lt_core_news_md", "lt_core_news_lg"] }, + { "code": "lv", "name": "Latvian" }, + { "code": "ml", "name": "Malayalam", "has_examples": true }, + { "code": "mr", "name": "Marathi" }, { "code": "nb", "name": "Norwegian Bokmål", @@ -101,6 +127,14 @@ "has_examples": true, "models": ["nb_core_news_sm", "nb_core_news_md", "nb_core_news_lg"] }, + { "code": "ne", "name": "Nepali", "has_examples": true }, + { + "code": "nl", + "name": "Dutch", + "models": ["nl_core_news_sm", "nl_core_news_md", "nl_core_news_lg"], + "example": "Dit is een zin.", + "has_examples": true + }, { "code": "pl", "name": "Polish", @@ -122,69 +156,26 @@ "has_examples": true, "models": ["ro_core_news_sm", "ro_core_news_md", "ro_core_news_lg"] }, - { - "code": "es", - "name": "Spanish", - "models": ["es_core_news_sm", "es_core_news_md", "es_core_news_lg"], - "example": "Esto es una frase.", - "has_examples": true - }, - { "code": "sv", "name": "Swedish", "has_examples": true }, - { "code": "fi", "name": "Finnish", "has_examples": true }, - { "code": "hu", "name": "Hungarian", "example": "Ez egy mondat.", "has_examples": true }, { "code": "ru", "name": "Russian", "has_examples": true, "dependencies": [{ "name": "pymorphy2", "url": "https://github.com/kmike/pymorphy2" }] }, - { - "code": "uk", - "name": "Ukrainian", - "has_examples": true, - "dependencies": [{ "name": "pymorphy2", "url": "https://github.com/kmike/pymorphy2" }] - }, - { "code": "hr", "name": "Croatian", "has_examples": true }, - { "code": "eu", "name": "Basque", "has_examples": true }, - { "code": "yo", "name": "Yoruba", "has_examples": true }, - { "code": "tr", "name": "Turkish", "example": "Bu bir cümledir.", "has_examples": true }, - { "code": "ca", "name": "Catalan", "example": "Això és una frase.", "has_examples": true }, - { "code": "he", "name": "Hebrew", "example": "זהו משפט.", "has_examples": true }, - { "code": "ar", "name": "Arabic", "example": "هذه جملة", "has_examples": true }, - { "code": "fa", "name": "Persian", "has_examples": true }, - { "code": "ur", "name": "Urdu", "example": "یہ ایک جملہ ہے", "has_examples": true }, - { "code": "tt", "name": "Tatar", "has_examples": true }, - { "code": "te", "name": "Telugu", "example": "ఇది ఒక వాక్యం.", "has_examples": true }, + { "code": "sa", "name": "Sanskrit", "has_examples": true }, { "code": "si", "name": "Sinhala", "example": "මෙය වාක්‍යයකි.", "has_examples": true }, - { "code": "ga", "name": "Irish" }, - { "code": "bn", 
"name": "Bengali", "has_examples": true }, - { "code": "hi", "name": "Hindi", "example": "यह एक वाक्य है।", "has_examples": true }, - { "code": "mr", "name": "Marathi" }, - { "code": "kn", "name": "Kannada" }, - { "code": "ta", "name": "Tamil", "has_examples": true }, - { - "code": "id", - "name": "Indonesian", - "example": "Ini adalah sebuah kalimat.", - "has_examples": true - }, - { "code": "tl", "name": "Tagalog" }, - { "code": "af", "name": "Afrikaans" }, - { "code": "bg", "name": "Bulgarian", "example": "Това е изречение", "has_examples": true }, - { "code": "cs", "name": "Czech" }, - { "code": "is", "name": "Icelandic" }, - { "code": "lv", "name": "Latvian" }, - { "code": "sr", "name": "Serbian" }, - { "code": "sk", "name": "Slovak" }, + { "code": "sk", "name": "Slovak", "has_examples": true }, { "code": "sl", "name": "Slovenian" }, - { "code": "lb", "name": "Luxembourgish" }, { "code": "sq", "name": "Albanian", "example": "Kjo është një fjali.", "has_examples": true }, - { "code": "et", "name": "Estonian" }, + { "code": "sr", "name": "Serbian", "has_examples": true }, + { "code": "sv", "name": "Swedish", "has_examples": true }, + { "code": "ta", "name": "Tamil", "has_examples": true }, + { "code": "te", "name": "Telugu", "example": "ఇది ఒక వాక్యం.", "has_examples": true }, { "code": "th", "name": "Thai", @@ -194,51 +185,43 @@ "example": "นี่คือประโยค", "has_examples": true }, + { "code": "tl", "name": "Tagalog" }, + { "code": "tr", "name": "Turkish", "example": "Bu bir cümledir.", "has_examples": true }, + { "code": "tt", "name": "Tatar", "has_examples": true }, { - "code": "ko", - "name": "Korean", - "dependencies": [ - { - "name": "mecab-ko", - "url": "https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md" - }, - { "name": "mecab-ko-dic", "url": "https://bitbucket.org/eunjeon/mecab-ko-dic" }, - { "name": "natto-py", "url": "https://github.com/buruzaemon/natto-py" } - ], - "example": "이것은 문장입니다.", - "has_examples": true + "code": "uk", + "name": "Ukrainian", + "has_examples": true, + "dependencies": [{ "name": "pymorphy2", "url": "https://github.com/kmike/pymorphy2" }] }, + { "code": "ur", "name": "Urdu", "example": "یہ ایک جملہ ہے", "has_examples": true }, { "code": "vi", "name": "Vietnamese", "dependencies": [{ "name": "Pyvi", "url": "https://github.com/trungtv/pyvi" }] }, - { - "code": "lij", - "name": "Ligurian", - "example": "Sta chì a l'é unna fraxe.", - "has_examples": true - }, - { - "code": "hy", - "name": "Armenian", - "has_examples": true - }, - { - "code": "gu", - "name": "Gujarati", - "has_examples": true - }, - { - "code": "ml", - "name": "Malayalam", - "has_examples": true - }, { "code": "xx", "name": "Multi-language", "models": ["xx_ent_wiki_sm"], "example": "This is a sentence about Facebook." 
+ }, + { "code": "yo", "name": "Yoruba", "has_examples": true }, + { + "code": "zh", + "name": "Chinese", + "models": ["zh_core_web_sm", "zh_core_web_md", "zh_core_web_lg"], + "dependencies": [ + { + "name": "Jieba", + "url": "https://github.com/fxsjy/jieba" + }, + { + "name": "PKUSeg", + "url": "https://github.com/lancopku/PKUSeg-python" + } + ], + "has_examples": true } ], "licenses": [ From 930b116f004bf4413851da6710712a77ae118dbb Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 23 Sep 2020 09:35:21 +0200 Subject: [PATCH 16/47] Update docs [ci skip] --- website/docs/usage/v3.md | 5 ++++- website/src/widgets/languages.js | 4 ++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/website/docs/usage/v3.md b/website/docs/usage/v3.md index 406ba4b75..28bd02e3e 100644 --- a/website/docs/usage/v3.md +++ b/website/docs/usage/v3.md @@ -88,7 +88,10 @@ import Benchmarks from 'usage/\_benchmarks-models.md' - **Architectures: ** [TransformerModel](/api/architectures#TransformerModel), [TransformerListener](/api/architectures#TransformerListener), [Tok2VecTransformer](/api/architectures#Tok2VecTransformer) -- **Trained Pipelines:** [`en_core_web_trf`](/models/en#en_core_web_trf) +- **Trained Pipelines:** [`en_core_web_trf`](/models/en#en_core_web_trf), + [`de_dep_news_trf`](/models/de#de_dep_news_trf), + [`es_dep_news_trf`](/models/es#es_dep_news_trf), + [`fr_dep_news_trf`](/models/fr#fr_dep_news_trf) - **Implementation:** [`spacy-transformers`](https://github.com/explosion/spacy-transformers) diff --git a/website/src/widgets/languages.js b/website/src/widgets/languages.js index bb26e57cd..74d850182 100644 --- a/website/src/widgets/languages.js +++ b/website/src/widgets/languages.js @@ -22,7 +22,7 @@ const Language = ({ name, code, models }) => ( {models && models.length ? ( - {models.length} {models.length === 1 ? 'model' : 'models'} + {models.length} {models.length === 1 ? 'package' : 'packages'} ) : ( none yet @@ -51,7 +51,7 @@ const Languages = () => ( Language Code Language Data - Models + Pipelines From 566d0487538c547dc40c14a80341c92a73378399 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 23 Sep 2020 09:43:51 +0200 Subject: [PATCH 17/47] Fix project repo link [ci skip] --- website/src/widgets/project.js | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/website/src/widgets/project.js b/website/src/widgets/project.js index 8d309394d..9e23d60ea 100644 --- a/website/src/widgets/project.js +++ b/website/src/widgets/project.js @@ -16,7 +16,8 @@ export default function Project({ }) { const repoArg = repo ? ` --repo ${repo}` : '' const text = `${COMMAND} ${id}${repoArg}` - const url = `${repo || projectsRepo}/${id}` + const defaultRepo = `https://github.com/${projectsRepo}` + const url = `${repo || defaultRepo}/${id}` const header = ( <> {title}:{' '} From 61235445db66b66181d76d217c92d2501128f699 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 23 Sep 2020 09:45:32 +0200 Subject: [PATCH 18/47] Update README.md [ci skip] --- README.md | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 61cefb69a..3e5e5febe 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,7 @@ state-of-the-art speed, convolutional **neural network models** for tagging, parsing, **named entity recognition**, **text classification** and more, multi-task learning with pretrained **transformers** like BERT, as well as a production-ready training system and easy model packaging, deployment and workflow management. 
spaCy is commercial open-source software, released under the MIT license. -💫 **Version 2.3 out now!** +💫 **Version 3.0 out now!** [Check out the release notes here.](https://github.com/explosion/spaCy/releases) [![Azure Pipelines](https://img.shields.io/azure-devops/build/explosion-ai/public/8/master.svg?logo=azure-pipelines&style=flat-square&label=build)](https://dev.azure.com/explosion-ai/public/_build?definitionId=8) @@ -29,16 +29,17 @@ spaCy is commercial open-source software, released under the MIT license. ## 📖 Documentation -| Documentation | | -| --------------- | -------------------------------------------------------------- | -| [spaCy 101] | New to spaCy? Here's everything you need to know! | -| [Usage Guides] | How to use spaCy and its features. | -| [New in v3.0] | New features, backwards incompatibilities and migration guide. | -| [API Reference] | The detailed reference for spaCy's API. | -| [Models] | Download statistical language models for spaCy. | -| [Universe] | Libraries, extensions, demos, books and courses. | -| [Changelog] | Changes and version history. | -| [Contribute] | How to contribute to the spaCy project and code base. | +| Documentation | | +| ------------------- | -------------------------------------------------------------- | +| [spaCy 101] | New to spaCy? Here's everything you need to know! | +| [Usage Guides] | How to use spaCy and its features. | +| [New in v3.0] | New features, backwards incompatibilities and migration guide. | +| [Project Templates] | End-to-end workflows you can clone, modify and run. | +| [API Reference] | The detailed reference for spaCy's API. | +| [Models] | Download statistical language models for spaCy. | +| [Universe] | Libraries, extensions, demos, books and courses. | +| [Changelog] | Changes and version history. | +| [Contribute] | How to contribute to the spaCy project and code base. | [spacy 101]: https://spacy.io/usage/spacy-101 [new in v3.0]: https://spacy.io/usage/v3 @@ -46,6 +47,7 @@ spaCy is commercial open-source software, released under the MIT license. 
[api reference]: https://spacy.io/api/ [models]: https://spacy.io/models [universe]: https://spacy.io/universe +[project templates]: https://github.com/explosion/projects [changelog]: https://spacy.io/usage#changelog [contribute]: https://github.com/explosion/spaCy/blob/master/CONTRIBUTING.md From 6ca06cb62cdbcddd1071fcc05871d675704c47a2 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 23 Sep 2020 10:14:27 +0200 Subject: [PATCH 19/47] Update docs and formatting [ci skip] --- spacy/cli/templates/quickstart_training.jinja | 2 +- website/docs/api/top-level.md | 19 ++++++++++ website/docs/usage/v3.md | 30 ++++++++------- website/src/components/infobox.js | 37 ++++++++++--------- 4 files changed, 56 insertions(+), 32 deletions(-) diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja index c55374899..7241c5116 100644 --- a/spacy/cli/templates/quickstart_training.jinja +++ b/spacy/cli/templates/quickstart_training.jinja @@ -327,7 +327,7 @@ sents_f = 0.0 ents_f = {{ (1.0 / components|length)|round(2) }} ents_p = 0.0 ents_r = 0.0 -{%- endif -%} +{%- endif %} {%- if "textcat" in components %} cats_score = {{ (1.0 / components|length)|round(2) }} {%- endif -%} diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index 2c082ae0b..f36be0806 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -632,6 +632,12 @@ the beginning of a multi-token entity, `I` the inside of an entity of three or more tokens, and `L` the end of an entity of two or more tokens. `U` denotes a single-token entity. + + +This method was previously available as `spacy.gold.biluo_tags_from_offsets`. + + + > #### Example > > ```python @@ -647,6 +653,7 @@ single-token entity. | ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | `doc` | The document that the entity offsets refer to. The output tags will refer to the token boundaries within the document. ~~Doc~~ | | `entities` | A sequence of `(start, end, label)` triples. `start` and `end` should be character-offset integers denoting the slice into the original string. ~~List[Tuple[int, int, Union[str, int]]]~~ | +| `missing` | The label used for missing values, e.g. if tokenization doesn't align with the entity offsets. Defaults to `"O"`. ~~str~~ | | **RETURNS** | A list of strings, describing the [BILUO](/usage/linguistic-features#accessing-ner) tags. ~~List[str]~~ | ### training.biluo_tags_to_offsets {#biluo_tags_to_offsets tag="function"} @@ -654,6 +661,12 @@ single-token entity. Encode per-token tags following the [BILUO scheme](/usage/linguistic-features#accessing-ner) into entity offsets. + + +This method was previously available as `spacy.gold.offsets_from_biluo_tags`. + + + > #### Example > > ```python @@ -678,6 +691,12 @@ Encode per-token tags following the [`Span`](/api/span) objects. This can be used to create entity spans from token-based tags, e.g. to overwrite the `doc.ents`. + + +This method was previously available as `spacy.gold.spans_from_biluo_tags`. + + + > #### Example > > ```python diff --git a/website/docs/usage/v3.md b/website/docs/usage/v3.md index 88935e720..91d97cae2 100644 --- a/website/docs/usage/v3.md +++ b/website/docs/usage/v3.md @@ -551,17 +551,19 @@ Note that spaCy v3.0 now requires **Python 3.6+**. 
### Removed or renamed API {#incompat-removed} -| Removed | Replacement | -| -------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------ | -| `Language.disable_pipes` | [`Language.select_pipes`](/api/language#select_pipes), [`Language.disable_pipe`](/api/language#disable_pipe) | -| `GoldParse` | [`Example`](/api/example) | -| `GoldCorpus` | [`Corpus`](/api/corpus) | -| `KnowledgeBase.load_bulk`, `KnowledgeBase.dump` | [`KnowledgeBase.from_disk`](/api/kb#from_disk), [`KnowledgeBase.to_disk`](/api/kb#to_disk) | -| `Matcher.pipe`, `PhraseMatcher.pipe` | not needed | -| `spacy init-model` | [`spacy init vocab`](/api/cli#init-vocab) | -| `spacy debug-data` | [`spacy debug data`](/api/cli#debug-data) | -| `spacy profile` | [`spacy debug profile`](/api/cli#debug-profile) | -| `spacy link`, `util.set_data_path`, `util.get_data_path` | not needed, symlinks are deprecated | +| Removed | Replacement | +| -------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `Language.disable_pipes` | [`Language.select_pipes`](/api/language#select_pipes), [`Language.disable_pipe`](/api/language#disable_pipe) | +| `Doc.is_tagged`, `Doc.is_parsed`, ... | [`Doc.has_annotation`](/api/doc#has_annotation) | +| `GoldParse` | [`Example`](/api/example) | +| `GoldCorpus` | [`Corpus`](/api/corpus) | +| `KnowledgeBase.load_bulk`, `KnowledgeBase.dump` | [`KnowledgeBase.from_disk`](/api/kb#from_disk), [`KnowledgeBase.to_disk`](/api/kb#to_disk) | +| `Matcher.pipe`, `PhraseMatcher.pipe` | not needed | +| `gold.offsets_from_biluo_tags`, `gold.spans_from_biluo_tags`, `gold.biluo_tags_from_offsets` | [`training.biluo_tags_to_offsets`](/api/top-level#biluo_tags_to_offsets), [`training.biluo_tags_to_spans`](/api/top-level#biluo_tags_to_spans), [`training.offsets_to_biluo_tags`](/api/top-level#offsets_to_biluo_tags) | +| `spacy init-model` | [`spacy init vocab`](/api/cli#init-vocab) | +| `spacy debug-data` | [`spacy debug data`](/api/cli#debug-data) | +| `spacy profile` | [`spacy debug profile`](/api/cli#debug-profile) | +| `spacy link`, `util.set_data_path`, `util.get_data_path` | not needed, symlinks are deprecated | The following deprecated methods, attributes and arguments were removed in v3.0. Most of them have been **deprecated for a while** and many would previously @@ -971,9 +973,9 @@ python -m spacy package ./output ./packages #### Data utilities and gold module {#migrating-gold} -The `spacy.gold` module has been renamed to `spacy.training` and the conversion -utilities now follow the naming format of `x_to_y`. This mostly -affects internals, but if you've been using the span offset conversion utilities +The `spacy.gold` module has been renamed to `spacy.training` and the conversion +utilities now follow the naming format of `x_to_y`. 
This mostly affects +internals, but if you've been using the span offset conversion utilities [`offsets_to_biluo_tags`](/api/top-level#offsets_to_biluo_tags), [`biluo_tags_to_offsets`](/api/top-level#biluo_tags_to_offsets) or [`biluo_tags_to_spans`](/api/top-level#biluo_tags_to_spans), you'll have to diff --git a/website/src/components/infobox.js b/website/src/components/infobox.js index 968b6cea8..b5a7af545 100644 --- a/website/src/components/infobox.js +++ b/website/src/components/infobox.js @@ -20,24 +20,27 @@ export default function Infobox({ [classes.danger]: variant === 'danger', }) return ( - + ) } From ae5dacf75f490c1b64257235cc2e4c93306d226e Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 23 Sep 2020 10:14:34 +0200 Subject: [PATCH 20/47] Tidy up and add types --- spacy/training/iob_utils.py | 54 +++++++++++++++++-------------------- 1 file changed, 24 insertions(+), 30 deletions(-) diff --git a/spacy/training/iob_utils.py b/spacy/training/iob_utils.py index 03a502912..91fc40205 100644 --- a/spacy/training/iob_utils.py +++ b/spacy/training/iob_utils.py @@ -1,9 +1,11 @@ +from typing import List, Tuple, Iterable, Union, Iterator import warnings + from ..errors import Errors, Warnings -from ..tokens import Span +from ..tokens import Span, Doc -def iob_to_biluo(tags): +def iob_to_biluo(tags: Iterable[str]) -> List[str]: out = [] tags = list(tags) while tags: @@ -12,7 +14,7 @@ def iob_to_biluo(tags): return out -def biluo_to_iob(tags): +def biluo_to_iob(tags: Iterable[str]) -> List[str]: out = [] for tag in tags: if tag is None: @@ -23,12 +25,12 @@ def biluo_to_iob(tags): return out -def _consume_os(tags): +def _consume_os(tags: List[str]) -> Iterator[str]: while tags and tags[0] == "O": yield tags.pop(0) -def _consume_ent(tags): +def _consume_ent(tags: List[str]) -> List[str]: if not tags: return [] tag = tags.pop(0) @@ -50,11 +52,7 @@ def _consume_ent(tags): return [start] + middle + [end] -def biluo_tags_from_doc(doc, missing="O"): - return doc_to_biluo_tags(doc, missing) - - -def doc_to_biluo_tags(doc, missing="O"): +def doc_to_biluo_tags(doc: Doc, missing: str = "O"): return offsets_to_biluo_tags( doc, [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents], @@ -62,11 +60,9 @@ def doc_to_biluo_tags(doc, missing="O"): ) -def biluo_tags_from_offsets(doc, entities, missing="O"): - return offsets_to_biluo_tags(doc, entities, missing) - - -def offsets_to_biluo_tags(doc, entities, missing="O"): +def offsets_to_biluo_tags( + doc: Doc, entities: Iterable[Tuple[int, int, Union[str, int]]], missing: str = "O" +) -> List[str]: """Encode labelled spans into per-token tags, using the Begin/In/Last/Unit/Out scheme (BILUO). @@ -77,7 +73,7 @@ def offsets_to_biluo_tags(doc, entities, missing="O"): the original string. RETURNS (list): A list of unicode strings, describing the tags. Each tag string will be of the form either "", "O" or "{action}-{label}", where - action is one of "B", "I", "L", "U". The string "-" is used where the + action is one of "B", "I", "L", "U". The missing label is used where the entity offsets don't align with the tokenization in the `Doc` object. The training algorithm will view these as missing values. "O" denotes a non-entity token. 
"B" denotes the beginning of a multi-token entity, @@ -93,7 +89,6 @@ def offsets_to_biluo_tags(doc, entities, missing="O"): """ # Ensure no overlapping entity labels exist tokens_in_ents = {} - starts = {token.idx: token.i for token in doc} ends = {token.idx + len(token): token.i for token in doc} biluo = ["-" for _ in doc] @@ -117,7 +112,6 @@ def offsets_to_biluo_tags(doc, entities, missing="O"): ) ) tokens_in_ents[token_index] = (start_char, end_char, label) - start_token = starts.get(start_char) end_token = ends.get(end_char) # Only interested if the tokenization is correct @@ -151,11 +145,7 @@ def offsets_to_biluo_tags(doc, entities, missing="O"): return biluo -def spans_from_biluo_tags(doc, tags): - return biluo_tags_to_spans(doc, tags) - - -def biluo_tags_to_spans(doc, tags): +def biluo_tags_to_spans(doc: Doc, tags: Iterable[str]) -> List[Span]: """Encode per-token tags following the BILUO scheme into Span object, e.g. to overwrite the doc.ents. @@ -173,11 +163,9 @@ def biluo_tags_to_spans(doc, tags): return spans -def offsets_from_biluo_tags(doc, tags): - return biluo_tags_to_offsets(doc, tags) - - -def biluo_tags_to_offsets(doc, tags): +def biluo_tags_to_offsets( + doc: Doc, tags: Iterable[str] +) -> List[Tuple[int, int, Union[str, int]]]: """Encode per-token tags following the BILUO scheme into entity offsets. doc (Doc): The document that the BILUO tags refer to. @@ -192,8 +180,8 @@ def biluo_tags_to_offsets(doc, tags): return [(span.start_char, span.end_char, span.label_) for span in spans] -def tags_to_entities(tags): - """ Note that the end index returned by this function is inclusive. +def tags_to_entities(tags: Iterable[str]) -> List[Tuple[str, int, int]]: + """Note that the end index returned by this function is inclusive. To use it for Span creation, increment the end by 1.""" entities = [] start = None @@ -225,3 +213,9 @@ def tags_to_entities(tags): else: raise ValueError(Errors.E068.format(tag=tag)) return entities + + +# Fallbacks to make backwards-compat easier +offsets_from_biluo_tags = biluo_tags_to_offsets +spans_from_biluo_tags = biluo_tags_to_spans +biluo_tags_from_offsets = offsets_to_biluo_tags From 20b0ec5dcf5b97a3c406ec6bd7aa3f32223c63fa Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 23 Sep 2020 10:37:12 +0200 Subject: [PATCH 21/47] avoid logging performance of frozen components --- spacy/cli/train.py | 6 ++++-- spacy/training/loggers.py | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index bf3749c9e..811a3ba86 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -152,7 +152,8 @@ def train( exclude=frozen_components, ) msg.info(f"Training. 
Initial learn rate: {optimizer.learn_rate}") - print_row, finalize_logger = train_logger(nlp) + with nlp.select_pipes(disable=[*frozen_components]): + print_row, finalize_logger = train_logger(nlp) try: progress = tqdm.tqdm(total=T_cfg["eval_frequency"], leave=False) @@ -163,7 +164,8 @@ def train( progress.close() print_row(info) if is_best_checkpoint and output_path is not None: - update_meta(T_cfg, nlp, info) + with nlp.select_pipes(disable=[*frozen_components]): + update_meta(T_cfg, nlp, info) with nlp.use_params(optimizer.averages): nlp.to_disk(output_path / "model-best") progress = tqdm.tqdm(total=T_cfg["eval_frequency"], leave=False) diff --git a/spacy/training/loggers.py b/spacy/training/loggers.py index 92b598033..dddf20169 100644 --- a/spacy/training/loggers.py +++ b/spacy/training/loggers.py @@ -11,9 +11,11 @@ def console_logger(): def setup_printer( nlp: "Language", ) -> Tuple[Callable[[Dict[str, Any]], None], Callable]: + # we assume here that only components are enabled that should be trained & logged + logged_pipes = nlp.pipe_names score_cols = list(nlp.config["training"]["score_weights"]) score_widths = [max(len(col), 6) for col in score_cols] - loss_cols = [f"Loss {pipe}" for pipe in nlp.pipe_names] + loss_cols = [f"Loss {pipe}" for pipe in logged_pipes] loss_widths = [max(len(col), 8) for col in loss_cols] table_header = ["E", "#"] + loss_cols + score_cols + ["Score"] table_header = [col.upper() for col in table_header] @@ -26,7 +28,7 @@ def console_logger(): try: losses = [ "{0:.2f}".format(float(info["losses"][pipe_name])) - for pipe_name in nlp.pipe_names + for pipe_name in logged_pipes ] except KeyError as e: raise KeyError( From 6435458d517e1ca689d2bcf6f996df59218957bf Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 23 Sep 2020 12:12:38 +0200 Subject: [PATCH 22/47] simplify expression --- spacy/cli/train.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 811a3ba86..2900ef379 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -152,7 +152,7 @@ def train( exclude=frozen_components, ) msg.info(f"Training. 
Initial learn rate: {optimizer.learn_rate}") - with nlp.select_pipes(disable=[*frozen_components]): + with nlp.select_pipes(disable=frozen_components): print_row, finalize_logger = train_logger(nlp) try: @@ -164,7 +164,7 @@ def train( progress.close() print_row(info) if is_best_checkpoint and output_path is not None: - with nlp.select_pipes(disable=[*frozen_components]): + with nlp.select_pipes(disable=frozen_components): update_meta(T_cfg, nlp, info) with nlp.use_params(optimizer.averages): nlp.to_disk(output_path / "model-best") From 02b69dd0d532fb4c8835868332268e2f6eead511 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 23 Sep 2020 12:56:54 +0200 Subject: [PATCH 23/47] Update models directory [ci skip] --- website/src/templates/models.js | 108 +++++++++++++------------------- 1 file changed, 44 insertions(+), 64 deletions(-) diff --git a/website/src/templates/models.js b/website/src/templates/models.js index 5061972b8..5d705048b 100644 --- a/website/src/templates/models.js +++ b/website/src/templates/models.js @@ -12,7 +12,6 @@ import Tag from '../components/tag' import { H2, Label } from '../components/typography' import Icon from '../components/icon' import Link from '../components/link' -import Grid from '../components/grid' import Infobox from '../components/infobox' import Accordion from '../components/accordion' import { join, arrayToObj, abbrNum, markdownToReact } from '../components/util' @@ -31,10 +30,16 @@ const MODEL_META = { wiki: 'Wikipedia', uas: 'Unlabelled dependencies', las: 'Labelled dependencies', + token_acc: 'Tokenization', + tok: 'Tokenization', tags_acc: 'Part-of-speech tags (fine grained tags, Token.tag)', - ents_f: 'Entities (F-score)', - ents_p: 'Entities (precision)', - ents_r: 'Entities (recall)', + tag: 'Part-of-speech tags (fine grained tags, Token.tag)', + ents_f: 'Named entities (F-score)', + ents_p: 'Named entities (precision)', + ents_r: 'Named entities (recall)', + sent_f: 'Sentence segmentation (F-score)', + sent_p: 'Sentence segmentation (precision)', + sent_r: 'Sentence segmentation (recall)', cpu: 'words per second on CPU', gpu: 'words per second on GPU', pipeline: 'Active processing pipeline components in order', @@ -83,25 +88,19 @@ function formatVectors(data) { } function formatAccuracy(data) { - if (!data) return null - const labels = { - las: 'LAS', - uas: 'UAS', - tags_acc: 'TAG', - ents_f: 'NER F', - ents_p: 'NER P', - ents_r: 'NER R', - } - const isSyntax = key => ['tags_acc', 'las', 'uas'].includes(key) - const isNer = key => key.startsWith('ents_') + if (!data) return [] return Object.keys(data) - .filter(key => labels[key]) - .map(key => ({ - label: labels[key], - value: data[key].toFixed(2), - help: MODEL_META[key], - type: isNer(key) ? 'ner' : isSyntax(key) ? 'syntax' : null, - })) + .map(label => { + const value = data[label] + return isNaN(value) + ? null + : { + label, + value: value.toFixed(2), + help: MODEL_META[label], + } + }) + .filter(item => item) } function formatModelMeta(data) { @@ -188,16 +187,6 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl { label: 'Author', content: author }, { label: 'License', content: license }, ] - const accuracy = [ - { - label: 'Syntax Accuracy', - items: meta.accuracy ? meta.accuracy.filter(a => a.type === 'syntax') : null, - }, - { - label: 'NER Accuracy', - items: meta.accuracy ? 
meta.accuracy.filter(a => a.type === 'ner') : null, - }, - ] const error = ( @@ -209,7 +198,6 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl

) - return (

- - {accuracy && - accuracy.map(({ label, items }, i) => - !items ? null : ( - - - - - - - - {items.map((item, i) => ( - - - - - ))} - -
{label}
- - {item.value}
- ) - )} -
{meta.notes && markdownToReact(meta.notes, MARKDOWN_COMPONENTS)} {hasInteractiveCode && ( @@ -288,7 +249,7 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl `import spacy`, `from spacy.lang.${langId}.examples import sentences `, ``, - `nlp = spacy.load('${name}')`, + `nlp = spacy.load("${name}")`, `doc = nlp(sentences[0])`, `print(doc.text)`, `for token in doc:`, @@ -296,6 +257,25 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl ].join('\n')} )} + {meta.accuracy && ( + + + + {meta.accuracy.map(({ label, value, help }) => ( + + + + + + ))} + +
+ {label.toUpperCase()} + {help} + {value} +
+
+ )} {labels && (

@@ -313,7 +293,7 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl const labelNames = labels[pipe] || [] const help = LABEL_SCHEME_META[pipe] return ( - + }

- + ) } From 7745d77a38a131f6ffec9b4ae43da8ef799c228e Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 23 Sep 2020 13:21:42 +0200 Subject: [PATCH 25/47] Fix whitespace in template [ci skip] --- spacy/cli/templates/quickstart_training.jinja | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja index 7241c5116..53fd99ee8 100644 --- a/spacy/cli/templates/quickstart_training.jinja +++ b/spacy/cli/templates/quickstart_training.jinja @@ -280,7 +280,7 @@ vectors = "{{ word_vectors }}" {% endif -%} {% if use_transformer -%} accumulate_gradient = {{ transformer["size_factor"] }} -{% endif %} +{% endif -%} dev_corpus = "corpora.dev" train_corpus = "corpora.train" From 6c85fab3167a468953b23b25d4a25a7fbdb478cd Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 23 Sep 2020 13:35:09 +0200 Subject: [PATCH 26/47] state_type and extra_state_tokens instead of nr_feature_tokens --- spacy/cli/templates/quickstart_training.jinja | 12 ++++--- spacy/ml/models/parser.py | 31 +++++++++---------- spacy/pipeline/dep_parser.pyx | 3 +- spacy/pipeline/ner.pyx | 3 +- .../tests/serialize/test_serialize_config.py | 9 ++++-- website/docs/api/architectures.md | 22 +++++++------ website/docs/usage/embeddings-transformers.md | 3 +- 7 files changed, 48 insertions(+), 35 deletions(-) diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja index 7241c5116..9dde2237b 100644 --- a/spacy/cli/templates/quickstart_training.jinja +++ b/spacy/cli/templates/quickstart_training.jinja @@ -59,7 +59,8 @@ factory = "parser" [components.parser.model] @architectures = "spacy.TransitionBasedParser.v1" -nr_feature_tokens = 8 +state_type = "deps" +extra_state_tokens = false hidden_width = 128 maxout_pieces = 3 use_upper = false @@ -79,7 +80,8 @@ factory = "ner" [components.ner.model] @architectures = "spacy.TransitionBasedParser.v1" -nr_feature_tokens = 3 +state_type = "ner" +extra_state_tokens = false hidden_width = 64 maxout_pieces = 2 use_upper = false @@ -183,7 +185,8 @@ factory = "parser" [components.parser.model] @architectures = "spacy.TransitionBasedParser.v1" -nr_feature_tokens = 8 +state_type = "deps" +extra_state_tokens = false hidden_width = 128 maxout_pieces = 3 use_upper = true @@ -200,7 +203,8 @@ factory = "ner" [components.ner.model] @architectures = "spacy.TransitionBasedParser.v1" -nr_feature_tokens = 6 +state_type = "ner" +extra_state_tokens = false hidden_width = 64 maxout_pieces = 2 use_upper = true diff --git a/spacy/ml/models/parser.py b/spacy/ml/models/parser.py index 868f9d6d2..0e10932d5 100644 --- a/spacy/ml/models/parser.py +++ b/spacy/ml/models/parser.py @@ -11,7 +11,8 @@ from ...tokens import Doc @registry.architectures.register("spacy.TransitionBasedParser.v1") def build_tb_parser_model( tok2vec: Model[List[Doc], List[Floats2d]], - nr_feature_tokens: int, + state_type: str, + extra_state_tokens: bool, hidden_width: int, maxout_pieces: int, use_upper: bool = True, @@ -40,20 +41,12 @@ def build_tb_parser_model( tok2vec (Model[List[Doc], List[Floats2d]]): Subnetwork to map tokens into vector representations. - nr_feature_tokens (int): The number of tokens in the context to use to - construct the state vector. Valid choices are 1, 2, 3, 6, 8 and 13. The - 2, 8 and 13 feature sets are designed for the parser, while the 3 and 6 - feature sets are designed for the NER. The recommended feature sets are - 3 for NER, and 8 for the dependency parser. 
- - TODO: This feature should be split into two, state_type: ["deps", "ner"] - and extra_state_features: [True, False]. This would map into: - - (deps, False): 8 - (deps, True): 13 - (ner, False): 3 - (ner, True): 6 - + state_type (str): + String value denoting the type of parser model: "deps" or "ner" + extra_state_tokens (bool): Whether or not to use additional tokens in the context + to construct the state vector. Defaults to `False`, which means 3 and 8 + for the NER and parser respectively. When set to `True`, this would become 6 + feature sets (for the NER) or 13 (for the parser). hidden_width (int): The width of the hidden layer. maxout_pieces (int): How many pieces to use in the state prediction layer. Recommended values are 1, 2 or 3. If 1, the maxout non-linearity @@ -68,8 +61,14 @@ def build_tb_parser_model( Usually inferred from data at the beginning of training, or loaded from disk. """ + if state_type == "deps": + nr_feature_tokens = 13 if extra_state_tokens else 8 + elif state_type == "ner": + nr_feature_tokens = 6 if extra_state_tokens else 3 + else: + raise ValueError(f"unknown state type {state_type}") # TODO error t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None - tok2vec = chain(tok2vec, list2array(), Linear(hidden_width, t2v_width),) + tok2vec = chain(tok2vec, list2array(), Linear(hidden_width, t2v_width)) tok2vec.set_dim("nO", hidden_width) lower = PrecomputableAffine( nO=hidden_width if use_upper else nO, diff --git a/spacy/pipeline/dep_parser.pyx b/spacy/pipeline/dep_parser.pyx index edd791e40..7d8c63815 100644 --- a/spacy/pipeline/dep_parser.pyx +++ b/spacy/pipeline/dep_parser.pyx @@ -15,7 +15,8 @@ from ..training import validate_examples default_model_config = """ [model] @architectures = "spacy.TransitionBasedParser.v1" -nr_feature_tokens = 8 +state_type = "deps" +extra_state_tokens = false hidden_width = 64 maxout_pieces = 2 diff --git a/spacy/pipeline/ner.pyx b/spacy/pipeline/ner.pyx index 2fa5c6392..fc4f03473 100644 --- a/spacy/pipeline/ner.pyx +++ b/spacy/pipeline/ner.pyx @@ -13,7 +13,8 @@ from ..training import validate_examples default_model_config = """ [model] @architectures = "spacy.TransitionBasedParser.v1" -nr_feature_tokens = 6 +state_type = "ner" +extra_state_tokens = false hidden_width = 64 maxout_pieces = 2 diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py index 1e17b3212..abfd4d725 100644 --- a/spacy/tests/serialize/test_serialize_config.py +++ b/spacy/tests/serialize/test_serialize_config.py @@ -67,7 +67,8 @@ width = ${components.tok2vec.model.width} parser_config_string = """ [model] @architectures = "spacy.TransitionBasedParser.v1" -nr_feature_tokens = 99 +state_type = "deps" +extra_state_tokens = false hidden_width = 66 maxout_pieces = 2 @@ -95,7 +96,11 @@ def my_parser(): MaxoutWindowEncoder(width=321, window_size=3, maxout_pieces=4, depth=2), ) parser = build_tb_parser_model( - tok2vec=tok2vec, nr_feature_tokens=7, hidden_width=65, maxout_pieces=5 + tok2vec=tok2vec, + state_type="deps", + extra_state_tokens=True, + hidden_width=65, + maxout_pieces=5, ) return parser diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md index 30d863b17..0d283d805 100644 --- a/website/docs/api/architectures.md +++ b/website/docs/api/architectures.md @@ -414,7 +414,8 @@ one component. 
> ```ini > [model] > @architectures = "spacy.TransitionBasedParser.v1" -> nr_feature_tokens = 6 +> state_type = "ner" +> extra_state_tokens = false > hidden_width = 64 > maxout_pieces = 2 > @@ -446,15 +447,16 @@ consists of either two or three subnetworks: state representation. If not present, the output from the lower model is used as action scores directly. -| Name | Description | -| ------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `tok2vec` | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~ | -| `nr_feature_tokens` | The number of tokens in the context to use to construct the state vector. Valid choices are `1`, `2`, `3`, `6`, `8` and `13`. The `2`, `8` and `13` feature sets are designed for the parser, while the `3` and `6` feature sets are designed for the entity recognizer. The recommended feature sets are `3` for NER, and `8` for the dependency parser. ~~int~~ | -| `hidden_width` | The width of the hidden layer. ~~int~~ | -| `maxout_pieces` | How many pieces to use in the state prediction layer. Recommended values are `1`, `2` or `3`. If `1`, the maxout non-linearity is replaced with a [`Relu`](https://thinc.ai/docs/api-layers#relu) non-linearity if `use_upper` is `True`, and no non-linearity if `False`. ~~int~~ | -| `use_upper` | Whether to use an additional hidden layer after the state vector in order to predict the action scores. It is recommended to set this to `False` for large pretrained models such as transformers, and `True` for smaller networks. The upper layer is computed on CPU, which becomes a bottleneck on larger GPU-based models, where it's also less necessary. ~~bool~~ | -| `nO` | The number of actions the model will predict between. Usually inferred from data at the beginning of training, or loaded from disk. ~~int~~ | -| **CREATES** | The model using the architecture. ~~Model[List[Docs], List[List[Floats2d]]]~~ | +| Name | Description | +| -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `tok2vec` | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~ | +| `state_type` | Which task to extract features for. Possible values are "ner" and "dependencies". ~~str~~ | +| `extra_state_tokens` | Whether to use an expanded feature set when extracting the state tokens. Slightly slower, but sometimes improves accuracy slightly. Defaults to `False`. ~~bool~~ | +| `hidden_width` | The width of the hidden layer. ~~int~~ | +| `maxout_pieces` | How many pieces to use in the state prediction layer. Recommended values are `1`, `2` or `3`. If `1`, the maxout non-linearity is replaced with a [`Relu`](https://thinc.ai/docs/api-layers#relu) non-linearity if `use_upper` is `True`, and no non-linearity if `False`. ~~int~~ | +| `use_upper` | Whether to use an additional hidden layer after the state vector in order to predict the action scores. 
It is recommended to set this to `False` for large pretrained models such as transformers, and `True` for smaller networks. The upper layer is computed on CPU, which becomes a bottleneck on larger GPU-based models, where it's also less necessary. ~~bool~~ | +| `nO` | The number of actions the model will predict between. Usually inferred from data at the beginning of training, or loaded from disk. ~~int~~ | +| **CREATES** | The model using the architecture. ~~Model[List[Docs], List[List[Floats2d]]]~~ | ## Tagging architectures {#tagger source="spacy/ml/models/tagger.py"} diff --git a/website/docs/usage/embeddings-transformers.md b/website/docs/usage/embeddings-transformers.md index a855d703c..d61172a5b 100644 --- a/website/docs/usage/embeddings-transformers.md +++ b/website/docs/usage/embeddings-transformers.md @@ -448,7 +448,8 @@ factory = "ner" [nlp.pipeline.ner.model] @architectures = "spacy.TransitionBasedParser.v1" -nr_feature_tokens = 3 +state_type = "ner" +extra_state_tokens = false hidden_width = 128 maxout_pieces = 3 use_upper = false From e4e7f5b00d46b0a6f75e419c509fbd0c73927121 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 23 Sep 2020 15:44:40 +0200 Subject: [PATCH 27/47] Update docs [ci skip] --- website/docs/usage/_benchmarks-models.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/website/docs/usage/_benchmarks-models.md b/website/docs/usage/_benchmarks-models.md index 33163f306..028746db0 100644 --- a/website/docs/usage/_benchmarks-models.md +++ b/website/docs/usage/_benchmarks-models.md @@ -28,7 +28,7 @@ on training Stanza on this corpus to allow direct comparison. | System | POS | UAS | LAS | | ------------------------------------------------------------------------------ | ---: | ---: | ---: | -| spaCy RoBERTa (2020) | | | | +| spaCy RoBERTa (2020) | 97.8 | 96.6 | 94.7 | | spaCy CNN (2020) | | | | | [Mrini et al.](https://khalilmrini.github.io/Label_Attention_Layer.pdf) (2019) | 97.3 | 97.4 | 96.3 | | [Zhou and Zhao](https://www.aclweb.org/anthology/P19-1230/) (2019) | 97.3 | 97.2 | 95.7 | @@ -37,7 +37,8 @@ on training Stanza on this corpus to allow direct comparison. **Accuracy on the Penn Treebank.** See [NLP-progress](http://nlpprogress.com/english/dependency_parsing.html) for more -results. +results. For spaCy's evaluation, see the +[project template](https://github.com/explosion/projects/tree/v3/benchmarks/parsing_penn_treebank). 
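For reference, the `state_type` and `extra_state_tokens` options introduced in PATCH 26/47 replace the old `nr_feature_tokens` integer. A minimal sketch of the mapping, mirroring the logic that patch adds to `build_tb_parser_model` in `spacy/ml/models/parser.py`; the helper name below is illustrative and not part of the patches, and a later patch in this series renames the dependency-parser value from "deps" to "parser":

def n_feature_tokens(state_type: str, extra_state_tokens: bool) -> int:
    # Dependency parsing uses 8 state tokens, or 13 with the extra feature set;
    # named entity recognition uses 3, or 6 with the extra feature set.
    if state_type == "deps":  # spelled "parser" after the later rename
        return 13 if extra_state_tokens else 8
    if state_type == "ner":
        return 6 if extra_state_tokens else 3
    raise ValueError(f"unknown state_type: {state_type}")

assert n_feature_tokens("deps", False) == 8
assert n_feature_tokens("ner", True) == 6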
From 76bbed3466519d384834715f48f240140c43e02e Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 23 Sep 2020 16:00:03 +0200 Subject: [PATCH 28/47] Use Literal type for nr_feature_tokens --- requirements.txt | 1 + setup.cfg | 1 + spacy/compat.py | 5 +++++ spacy/ml/models/parser.py | 3 ++- spacy/tests/serialize/test_serialize_config.py | 14 ++++++++++++-- 5 files changed, 21 insertions(+), 3 deletions(-) diff --git a/requirements.txt b/requirements.txt index 4d6c1dfd0..a8b237aa1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -20,6 +20,7 @@ pytokenizations setuptools packaging importlib_metadata>=0.20; python_version < "3.8" +typing_extensions>=3.7.4; python_version < "3.8" # Development dependencies cython>=0.25 pytest>=4.6.5 diff --git a/setup.cfg b/setup.cfg index dd0975800..9831402d1 100644 --- a/setup.cfg +++ b/setup.cfg @@ -57,6 +57,7 @@ install_requires = setuptools packaging importlib_metadata>=0.20; python_version < "3.8" + typing_extensions>=3.7.4; python_version < "3.8" [options.entry_points] console_scripts = diff --git a/spacy/compat.py b/spacy/compat.py index 2d51ff0ae..6eca18b80 100644 --- a/spacy/compat.py +++ b/spacy/compat.py @@ -22,6 +22,11 @@ try: except ImportError: cupy = None +try: # Python 3.8+ + from typing import Literal +except ImportError: + from typing_extensions import Literal # noqa: F401 + from thinc.api import Optimizer # noqa: F401 pickle = pickle diff --git a/spacy/ml/models/parser.py b/spacy/ml/models/parser.py index 868f9d6d2..68cc20e9b 100644 --- a/spacy/ml/models/parser.py +++ b/spacy/ml/models/parser.py @@ -2,6 +2,7 @@ from typing import Optional, List from thinc.api import Model, chain, list2array, Linear, zero_init, use_ops from thinc.types import Floats2d +from ...compat import Literal from ...util import registry from .._precomputable_affine import PrecomputableAffine from ..tb_framework import TransitionModel @@ -11,7 +12,7 @@ from ...tokens import Doc @registry.architectures.register("spacy.TransitionBasedParser.v1") def build_tb_parser_model( tok2vec: Model[List[Doc], List[Floats2d]], - nr_feature_tokens: int, + nr_feature_tokens: Literal[3, 6, 8, 13], hidden_width: int, maxout_pieces: int, use_upper: bool = True, diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py index 1e17b3212..5f25cbfe1 100644 --- a/spacy/tests/serialize/test_serialize_config.py +++ b/spacy/tests/serialize/test_serialize_config.py @@ -67,7 +67,7 @@ width = ${components.tok2vec.model.width} parser_config_string = """ [model] @architectures = "spacy.TransitionBasedParser.v1" -nr_feature_tokens = 99 +nr_feature_tokens = 3 hidden_width = 66 maxout_pieces = 2 @@ -95,7 +95,7 @@ def my_parser(): MaxoutWindowEncoder(width=321, window_size=3, maxout_pieces=4, depth=2), ) parser = build_tb_parser_model( - tok2vec=tok2vec, nr_feature_tokens=7, hidden_width=65, maxout_pieces=5 + tok2vec=tok2vec, nr_feature_tokens=8, hidden_width=65, maxout_pieces=5 ) return parser @@ -340,3 +340,13 @@ def test_config_auto_fill_extra_fields(): assert "extra" not in nlp.config["training"] # Make sure the config generated is valid load_model_from_config(nlp.config) + + +def test_config_validate_literal(): + nlp = English() + config = Config().from_str(parser_config_string) + config["model"]["nr_feature_tokens"] = 666 + with pytest.raises(ConfigValidationError): + nlp.add_pipe("parser", config=config) + config["model"]["nr_feature_tokens"] = 13 + nlp.add_pipe("parser", config=config) From 50a4425cdaed350653368c9c350f95717e9414d9 Mon Sep 
17 00:00:00 2001 From: Ines Montani Date: Wed, 23 Sep 2020 16:03:32 +0200 Subject: [PATCH 29/47] Adjust docs --- spacy/ml/models/parser.py | 4 ++-- website/docs/api/architectures.md | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/ml/models/parser.py b/spacy/ml/models/parser.py index 68cc20e9b..5d091c590 100644 --- a/spacy/ml/models/parser.py +++ b/spacy/ml/models/parser.py @@ -42,8 +42,8 @@ def build_tb_parser_model( tok2vec (Model[List[Doc], List[Floats2d]]): Subnetwork to map tokens into vector representations. nr_feature_tokens (int): The number of tokens in the context to use to - construct the state vector. Valid choices are 1, 2, 3, 6, 8 and 13. The - 2, 8 and 13 feature sets are designed for the parser, while the 3 and 6 + construct the state vector. Valid choices are 3, 6, 8 and 13. The + 8 and 13 feature sets are designed for the parser, while the 3 and 6 feature sets are designed for the NER. The recommended feature sets are 3 for NER, and 8 for the dependency parser. diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md index 30d863b17..8797b2f31 100644 --- a/website/docs/api/architectures.md +++ b/website/docs/api/architectures.md @@ -449,7 +449,7 @@ consists of either two or three subnetworks: | Name | Description | | ------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `tok2vec` | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~ | -| `nr_feature_tokens` | The number of tokens in the context to use to construct the state vector. Valid choices are `1`, `2`, `3`, `6`, `8` and `13`. The `2`, `8` and `13` feature sets are designed for the parser, while the `3` and `6` feature sets are designed for the entity recognizer. The recommended feature sets are `3` for NER, and `8` for the dependency parser. ~~int~~ | +| `nr_feature_tokens` | The number of tokens in the context to use to construct the state vector. Valid choices are `3`, `6`, `8` and `13`. The `8` and `13` feature sets are designed for the parser, while the `3` and `6` feature sets are designed for the entity recognizer. The recommended feature sets are `3` for NER, and `8` for the dependency parser. ~~int~~ | | `hidden_width` | The width of the hidden layer. ~~int~~ | | `maxout_pieces` | How many pieces to use in the state prediction layer. Recommended values are `1`, `2` or `3`. If `1`, the maxout non-linearity is replaced with a [`Relu`](https://thinc.ai/docs/api-layers#relu) non-linearity if `use_upper` is `True`, and no non-linearity if `False`. ~~int~~ | | `use_upper` | Whether to use an additional hidden layer after the state vector in order to predict the action scores. It is recommended to set this to `False` for large pretrained models such as transformers, and `True` for smaller networks. The upper layer is computed on CPU, which becomes a bottleneck on larger GPU-based models, where it's also less necessary. 
~~bool~~ | From dd2292793f3bbd7cdfd2cf42bad205ec7428016a Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 23 Sep 2020 16:53:49 +0200 Subject: [PATCH 30/47] 'parser' instead of 'deps' for state_type --- spacy/cli/templates/quickstart_training.jinja | 4 ++-- spacy/ml/models/parser.py | 4 ++-- spacy/pipeline/dep_parser.pyx | 2 +- spacy/tests/serialize/test_serialize_config.py | 4 ++-- website/docs/api/architectures.md | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja index 9dde2237b..bc7e206f5 100644 --- a/spacy/cli/templates/quickstart_training.jinja +++ b/spacy/cli/templates/quickstart_training.jinja @@ -59,7 +59,7 @@ factory = "parser" [components.parser.model] @architectures = "spacy.TransitionBasedParser.v1" -state_type = "deps" +state_type = "parser" extra_state_tokens = false hidden_width = 128 maxout_pieces = 3 @@ -185,7 +185,7 @@ factory = "parser" [components.parser.model] @architectures = "spacy.TransitionBasedParser.v1" -state_type = "deps" +state_type = "parser" extra_state_tokens = false hidden_width = 128 maxout_pieces = 3 diff --git a/spacy/ml/models/parser.py b/spacy/ml/models/parser.py index 0e10932d5..b6e4b8d8a 100644 --- a/spacy/ml/models/parser.py +++ b/spacy/ml/models/parser.py @@ -42,7 +42,7 @@ def build_tb_parser_model( tok2vec (Model[List[Doc], List[Floats2d]]): Subnetwork to map tokens into vector representations. state_type (str): - String value denoting the type of parser model: "deps" or "ner" + String value denoting the type of parser model: "parser" or "ner" extra_state_tokens (bool): Whether or not to use additional tokens in the context to construct the state vector. Defaults to `False`, which means 3 and 8 for the NER and parser respectively. When set to `True`, this would become 6 @@ -61,7 +61,7 @@ def build_tb_parser_model( Usually inferred from data at the beginning of training, or loaded from disk. 
""" - if state_type == "deps": + if state_type == "parser": nr_feature_tokens = 13 if extra_state_tokens else 8 elif state_type == "ner": nr_feature_tokens = 6 if extra_state_tokens else 3 diff --git a/spacy/pipeline/dep_parser.pyx b/spacy/pipeline/dep_parser.pyx index 7d8c63815..a49475c8e 100644 --- a/spacy/pipeline/dep_parser.pyx +++ b/spacy/pipeline/dep_parser.pyx @@ -15,7 +15,7 @@ from ..training import validate_examples default_model_config = """ [model] @architectures = "spacy.TransitionBasedParser.v1" -state_type = "deps" +state_type = "parser" extra_state_tokens = false hidden_width = 64 maxout_pieces = 2 diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py index abfd4d725..10e0e132b 100644 --- a/spacy/tests/serialize/test_serialize_config.py +++ b/spacy/tests/serialize/test_serialize_config.py @@ -67,7 +67,7 @@ width = ${components.tok2vec.model.width} parser_config_string = """ [model] @architectures = "spacy.TransitionBasedParser.v1" -state_type = "deps" +state_type = "parser" extra_state_tokens = false hidden_width = 66 maxout_pieces = 2 @@ -97,7 +97,7 @@ def my_parser(): ) parser = build_tb_parser_model( tok2vec=tok2vec, - state_type="deps", + state_type="parser", extra_state_tokens=True, hidden_width=65, maxout_pieces=5, diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md index 0d283d805..ef2666ec0 100644 --- a/website/docs/api/architectures.md +++ b/website/docs/api/architectures.md @@ -450,7 +450,7 @@ consists of either two or three subnetworks: | Name | Description | | -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `tok2vec` | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~ | -| `state_type` | Which task to extract features for. Possible values are "ner" and "dependencies". ~~str~~ | +| `state_type` | Which task to extract features for. Possible values are "ner" and "parser". ~~str~~ | | `extra_state_tokens` | Whether to use an expanded feature set when extracting the state tokens. Slightly slower, but sometimes improves accuracy slightly. Defaults to `False`. ~~bool~~ | | `hidden_width` | The width of the hidden layer. ~~int~~ | | `maxout_pieces` | How many pieces to use in the state prediction layer. Recommended values are `1`, `2` or `3`. If `1`, the maxout non-linearity is replaced with a [`Relu`](https://thinc.ai/docs/api-layers#relu) non-linearity if `use_upper` is `True`, and no non-linearity if `False`. 
~~int~~ | From 3c3863654e2804223a30c8ed3cae3d2e73147ca6 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 23 Sep 2020 16:54:43 +0200 Subject: [PATCH 31/47] Increment version [ci skip] --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index b57bbeda2..b0cdd562c 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy-nightly" -__version__ = "3.0.0a21" +__version__ = "3.0.0a22" __release__ = True __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" From 25b34bba9406a3185406e79e8b0e45048e7f3914 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 23 Sep 2020 16:57:14 +0200 Subject: [PATCH 32/47] throw custom error when state_type is invalid --- spacy/errors.py | 2 ++ spacy/ml/models/parser.py | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/spacy/errors.py b/spacy/errors.py index 153f8da0c..47a134c1f 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -480,6 +480,8 @@ class Errors: E201 = ("Span index out of range.") # TODO: fix numbering after merging develop into master + E917 = ("Received invalid value {value} for 'state_type' in " + "TransitionBasedParser: only 'parser' or 'ner' are valid options.") E918 = ("Received invalid value for vocab: {vocab} ({vocab_type}). Valid " "values are an instance of spacy.vocab.Vocab or True to create one" " (default).") diff --git a/spacy/ml/models/parser.py b/spacy/ml/models/parser.py index b6e4b8d8a..dbea6b507 100644 --- a/spacy/ml/models/parser.py +++ b/spacy/ml/models/parser.py @@ -2,6 +2,7 @@ from typing import Optional, List from thinc.api import Model, chain, list2array, Linear, zero_init, use_ops from thinc.types import Floats2d +from ... import Errors from ...util import registry from .._precomputable_affine import PrecomputableAffine from ..tb_framework import TransitionModel @@ -66,7 +67,7 @@ def build_tb_parser_model( elif state_type == "ner": nr_feature_tokens = 6 if extra_state_tokens else 3 else: - raise ValueError(f"unknown state type {state_type}") # TODO error + raise ValueError(Errors.E917.format(value=state_type)) t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None tok2vec = chain(tok2vec, list2array(), Linear(hidden_width, t2v_width)) tok2vec.set_dim("nO", hidden_width) From 5a9fdbc8ad8e6e03968b78e026b8ee75e4c4a3e1 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 23 Sep 2020 17:32:14 +0200 Subject: [PATCH 33/47] state_type as Literal --- spacy/ml/models/parser.py | 5 +++-- spacy/tests/serialize/test_serialize_config.py | 10 ++++++++++ 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/spacy/ml/models/parser.py b/spacy/ml/models/parser.py index dbea6b507..2c40bb3ab 100644 --- a/spacy/ml/models/parser.py +++ b/spacy/ml/models/parser.py @@ -2,7 +2,8 @@ from typing import Optional, List from thinc.api import Model, chain, list2array, Linear, zero_init, use_ops from thinc.types import Floats2d -from ... 
import Errors +from ...errors import Errors +from ...compat import Literal from ...util import registry from .._precomputable_affine import PrecomputableAffine from ..tb_framework import TransitionModel @@ -12,7 +13,7 @@ from ...tokens import Doc @registry.architectures.register("spacy.TransitionBasedParser.v1") def build_tb_parser_model( tok2vec: Model[List[Doc], List[Floats2d]], - state_type: str, + state_type: Literal["parser", "ner"], extra_state_tokens: bool, hidden_width: int, maxout_pieces: int, diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py index 10e0e132b..6aad59272 100644 --- a/spacy/tests/serialize/test_serialize_config.py +++ b/spacy/tests/serialize/test_serialize_config.py @@ -345,3 +345,13 @@ def test_config_auto_fill_extra_fields(): assert "extra" not in nlp.config["training"] # Make sure the config generated is valid load_model_from_config(nlp.config) + + +def test_config_validate_literal(): + nlp = English() + config = Config().from_str(parser_config_string) + config["model"]["state_type"] = "nonsense" + with pytest.raises(ConfigValidationError): + nlp.add_pipe("parser", config=config) + config["model"]["state_type"] = "ner" + nlp.add_pipe("parser", config=config) \ No newline at end of file From b816ace4bbd158524865b7e995da8fa23ee0bc2b Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 23 Sep 2020 17:33:13 +0200 Subject: [PATCH 34/47] format --- spacy/tests/serialize/test_serialize_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py index 6aad59272..ec7544456 100644 --- a/spacy/tests/serialize/test_serialize_config.py +++ b/spacy/tests/serialize/test_serialize_config.py @@ -354,4 +354,4 @@ def test_config_validate_literal(): with pytest.raises(ConfigValidationError): nlp.add_pipe("parser", config=config) config["model"]["state_type"] = "ner" - nlp.add_pipe("parser", config=config) \ No newline at end of file + nlp.add_pipe("parser", config=config) From 3f77eb749c411f78dc21135deb446ad8d5fde76c Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 23 Sep 2020 19:50:15 +0200 Subject: [PATCH 35/47] Increment version [ci skip] --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index b0cdd562c..8d019897b 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy-nightly" -__version__ = "3.0.0a22" +__version__ = "3.0.0a23" __release__ = True __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" From f25f05c503c83949c9831028e221f3d024358889 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 23 Sep 2020 20:03:04 +0200 Subject: [PATCH 36/47] Adjust sort order [ci skip] --- spacy/util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/util.py b/spacy/util.py index 93000ea27..025fe5288 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -61,7 +61,7 @@ LEXEME_NORM_LANGS = ["da", "de", "el", "en", "id", "lb", "pt", "ru", "sr", "ta", # Default order of sections in the config.cfg. Not all sections needs to exist, # and additional sections are added at the end, in alphabetical order. 
# fmt: off -CONFIG_SECTION_ORDER = ["paths", "variables", "system", "nlp", "components", "training", "pretraining"] +CONFIG_SECTION_ORDER = ["paths", "variables", "system", "nlp", "components", "corpora", "training", "pretraining"] # fmt: on From c8bda92243b7752ad88be46e071368376704fb2b Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 23 Sep 2020 20:05:02 +0200 Subject: [PATCH 37/47] Update benchmarks [ci skip] --- website/docs/usage/_benchmarks-models.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/usage/_benchmarks-models.md b/website/docs/usage/_benchmarks-models.md index 028746db0..c5ce95e2f 100644 --- a/website/docs/usage/_benchmarks-models.md +++ b/website/docs/usage/_benchmarks-models.md @@ -28,7 +28,7 @@ on training Stanza on this corpus to allow direct comparison. | System | POS | UAS | LAS | | ------------------------------------------------------------------------------ | ---: | ---: | ---: | -| spaCy RoBERTa (2020) | 97.8 | 96.6 | 94.7 | +| spaCy RoBERTa (2020) | 98.0 | 96.8 | 95.0 | | spaCy CNN (2020) | | | | | [Mrini et al.](https://khalilmrini.github.io/Label_Attention_Layer.pdf) (2019) | 97.3 | 97.4 | 96.3 | | [Zhou and Zhao](https://www.aclweb.org/anthology/P19-1230/) (2019) | 97.3 | 97.2 | 95.7 | From 02008e9a55ea0d4a3ac41cb2324d89c9f837abcd Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 23 Sep 2020 22:02:31 +0200 Subject: [PATCH 38/47] Update docs [ci skip] --- website/docs/usage/_benchmarks-models.md | 40 +++++++++++------------- website/docs/usage/facts-figures.md | 19 +++++++++++ website/src/widgets/landing.js | 2 +- 3 files changed, 39 insertions(+), 22 deletions(-) diff --git a/website/docs/usage/_benchmarks-models.md b/website/docs/usage/_benchmarks-models.md index c5ce95e2f..1fe6e2bff 100644 --- a/website/docs/usage/_benchmarks-models.md +++ b/website/docs/usage/_benchmarks-models.md @@ -4,21 +4,16 @@ import { Help } from 'components/typography'; import Link from 'components/link'
-| System | Parser | Tagger | NER | WPS
CPU words per second on CPU, higher is better | WPS
GPU words per second on GPU, higher is better | -| ------------------------------------------------------------------------- | ----------------: | ----------------: | ---: | ------------------------------------------------------------------: | -----------------------------------------------------------------: | -| [`en_core_web_trf`](/models/en#en_core_web_trf) (spaCy v3) | | | | | 6k | -| [`en_core_web_lg`](/models/en#en_core_web_lg) (spaCy v3) | | | | | | -| `en_core_web_lg` (spaCy v2) | 91.9 | 97.2 | 85.9 | 10k | | -| [Stanza](https://stanfordnlp.github.io/stanza/) (StanfordNLP)1 | _n/a_2 | _n/a_2 | 88.8 | 234 | 2k | -| Flair | - | 97.9 | 89.3 | | | +| System | Parser | Tagger | NER | WPS
CPU words per second on CPU, higher is better | WPS
GPU words per second on GPU, higher is better | +| ---------------------------------------------------------- | -----: | -----: | ---: | ------------------------------------------------------------------: | -----------------------------------------------------------------: | +| [`en_core_web_trf`](/models/en#en_core_web_trf) (spaCy v3) | | | | | 6k | +| [`en_core_web_lg`](/models/en#en_core_web_lg) (spaCy v3) | | | | | | +| `en_core_web_lg` (spaCy v2) | 91.9 | 97.2 | 85.9 | 10k | |
**Accuracy and speed on the -[OntoNotes 5.0](https://catalog.ldc.upenn.edu/LDC2013T19) corpus.**
**1. ** -[Qi et al. (2020)](https://arxiv.org/pdf/2003.07082.pdf). **2. ** _Coming soon_: -Qi et al. don't report parsing and tagging results on OntoNotes. We're working -on training Stanza on this corpus to allow direct comparison. +[OntoNotes 5.0](https://catalog.ldc.upenn.edu/LDC2013T19) corpus.**
@@ -26,19 +21,22 @@ on training Stanza on this corpus to allow direct comparison.
-| System | POS | UAS | LAS | -| ------------------------------------------------------------------------------ | ---: | ---: | ---: | -| spaCy RoBERTa (2020) | 98.0 | 96.8 | 95.0 | -| spaCy CNN (2020) | | | | -| [Mrini et al.](https://khalilmrini.github.io/Label_Attention_Layer.pdf) (2019) | 97.3 | 97.4 | 96.3 | -| [Zhou and Zhao](https://www.aclweb.org/anthology/P19-1230/) (2019) | 97.3 | 97.2 | 95.7 | +| Named Entity Recognition Model | OntoNotes | CoNLL '03 | +| ------------------------------------------------------------------------------ | --------: | --------- | +| spaCy RoBERTa (2020) | +| spaCy CNN (2020) | | +| spaCy CNN (2017) | 86.4 | +| [Stanza](https://stanfordnlp.github.io/stanza/) (StanfordNLP)1 | 88.8 | +| Flair2 | 89.7 |
-**Accuracy on the Penn Treebank.** See -[NLP-progress](http://nlpprogress.com/english/dependency_parsing.html) for more -results. For spaCy's evaluation, see the -[project template](https://github.com/explosion/projects/tree/v3/benchmarks/parsing_penn_treebank). +**Named entity recognition accuracy** on the +[OntoNotes 5.0](https://catalog.ldc.upenn.edu/LDC2013T19) and +[CoNLL-2003](https://www.aclweb.org/anthology/W03-0419.pdf) corpora. See +[NLP-progress](http://nlpprogress.com/english/named_entity_recognition.html) for +more results. **1. ** [Qi et al. (2020)](https://arxiv.org/pdf/2003.07082.pdf). +**2. ** [Akbik et al. (2018)](https://www.aclweb.org/anthology/C18-1139/)
diff --git a/website/docs/usage/facts-figures.md b/website/docs/usage/facts-figures.md index 75f92070a..ad6776b2c 100644 --- a/website/docs/usage/facts-figures.md +++ b/website/docs/usage/facts-figures.md @@ -61,6 +61,25 @@ import Benchmarks from 'usage/\_benchmarks-models.md' +
+ +| System | UAS | LAS | +| ------------------------------------------------------------------------------ | ---: | ---: | +| spaCy RoBERTa (2020) | 96.8 | 95.0 | +| spaCy CNN (2020) | 93.7 | 91.8 | +| [Mrini et al.](https://khalilmrini.github.io/Label_Attention_Layer.pdf) (2019) | 97.4 | 96.3 | +| [Zhou and Zhao](https://www.aclweb.org/anthology/P19-1230/) (2019) | 97.2 | 95.7 | + +
+ +**Accuracy on the Penn Treebank.** See +[NLP-progress](http://nlpprogress.com/english/dependency_parsing.html) for more +results. + +
+ +
+ The easiest way to reproduce spaCy's benchmarks on the Penn Treebank is to clone diff --git a/website/src/widgets/landing.js b/website/src/widgets/landing.js index 2e75c893a..6fe7f4cdf 100644 --- a/website/src/widgets/landing.js +++ b/website/src/widgets/landing.js @@ -297,7 +297,7 @@ const Landing = ({ data }) => { to run.

- +

From e2ffe51fb5c18b18397930d976fe323f75d02863 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 24 Sep 2020 10:13:41 +0200 Subject: [PATCH 39/47] Update docs [ci skip] --- website/docs/usage/_benchmarks-models.md | 4 ++-- website/docs/usage/facts-figures.md | 16 +++++----------- website/docs/usage/projects.md | 2 +- website/gatsby-config.js | 1 + 4 files changed, 9 insertions(+), 14 deletions(-) diff --git a/website/docs/usage/_benchmarks-models.md b/website/docs/usage/_benchmarks-models.md index 1fe6e2bff..a00229867 100644 --- a/website/docs/usage/_benchmarks-models.md +++ b/website/docs/usage/_benchmarks-models.md @@ -12,8 +12,8 @@ import { Help } from 'components/typography'; import Link from 'components/link'
-**Accuracy and speed on the -[OntoNotes 5.0](https://catalog.ldc.upenn.edu/LDC2013T19) corpus.** +**Full pipeline accuracy and speed** on the +[OntoNotes 5.0](https://catalog.ldc.upenn.edu/LDC2013T19) corpus.
diff --git a/website/docs/usage/facts-figures.md b/website/docs/usage/facts-figures.md index ad6776b2c..743dae74d 100644 --- a/website/docs/usage/facts-figures.md +++ b/website/docs/usage/facts-figures.md @@ -65,28 +65,22 @@ import Benchmarks from 'usage/\_benchmarks-models.md' | System | UAS | LAS | | ------------------------------------------------------------------------------ | ---: | ---: | -| spaCy RoBERTa (2020) | 96.8 | 95.0 | -| spaCy CNN (2020) | 93.7 | 91.8 | +| spaCy RoBERTa (2020)1 | 96.8 | 95.0 | +| spaCy CNN (2020)1 | 93.7 | 91.8 | | [Mrini et al.](https://khalilmrini.github.io/Label_Attention_Layer.pdf) (2019) | 97.4 | 96.3 | | [Zhou and Zhao](https://www.aclweb.org/anthology/P19-1230/) (2019) | 97.2 | 95.7 |
-**Accuracy on the Penn Treebank.** See +**Dependency parsing accuracy** on the Penn Treebank. See [NLP-progress](http://nlpprogress.com/english/dependency_parsing.html) for more -results. +results. **1. ** Project template: +[`benchmarks/parsing_penn_treebank`](%%GITHUB_PROJECTS/benchmarks/parsing_penn_treebank).
- - -The easiest way to reproduce spaCy's benchmarks on the Penn Treebank is to clone -our project template. - - - diff --git a/website/docs/usage/projects.md b/website/docs/usage/projects.md index 95e20525a..8e093e8d6 100644 --- a/website/docs/usage/projects.md +++ b/website/docs/usage/projects.md @@ -213,7 +213,7 @@ a quick web demo. It looks pretty similar to a config file used to define CI pipelines. ```yaml -https://github.com/explosion/projects/tree/v3/pipelines/tagger_parser_ud/project.yml +%%GITHUB_PROJECTS/pipelines/tagger_parser_ud/project.yml ``` | Section | Description | diff --git a/website/gatsby-config.js b/website/gatsby-config.js index 5e3b5b537..c1a2f9ab9 100644 --- a/website/gatsby-config.js +++ b/website/gatsby-config.js @@ -24,6 +24,7 @@ const branch = isNightly ? 'develop' : 'master' // Those variables are going to be replaced in the Markdown, e.g. %%GITHUB_SPACY const replacements = { GITHUB_SPACY: `https://github.com/explosion/spaCy/tree/${branch}`, + GITHUB_PROJECTS: `https://github.com/${site.projectsRepo}`, } /** From ae51f580c1cd8a4168253d326fd9c1356fc88844 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 24 Sep 2020 10:27:33 +0200 Subject: [PATCH 40/47] Fix handling of score_weights --- spacy/cli/templates/quickstart_training.jinja | 18 --------- spacy/cli/train.py | 5 ++- spacy/lang/bn/__init__.py | 1 - spacy/lang/el/__init__.py | 1 - spacy/lang/en/__init__.py | 1 - spacy/lang/fa/__init__.py | 1 - spacy/lang/fr/__init__.py | 1 - spacy/lang/nb/__init__.py | 1 - spacy/lang/nl/__init__.py | 1 - spacy/lang/pl/__init__.py | 1 - spacy/lang/ru/__init__.py | 1 - spacy/lang/sv/__init__.py | 1 - spacy/lang/uk/__init__.py | 1 - spacy/language.py | 20 ++++++---- spacy/pipeline/dep_parser.pyx | 10 ++++- spacy/pipeline/entityruler.py | 8 +++- spacy/pipeline/lemmatizer.py | 1 - spacy/pipeline/morphologizer.pyx | 3 +- spacy/pipeline/ner.pyx | 3 +- spacy/pipeline/sentencizer.pyx | 1 - spacy/pipeline/senter.pyx | 1 - spacy/pipeline/tagger.pyx | 1 - spacy/pipeline/textcat.py | 23 ++++++----- spacy/schemas.py | 2 +- spacy/tests/pipeline/test_pipe_factories.py | 23 ++++++++--- spacy/util.py | 11 ++++++ website/docs/api/language.md | 39 +++++++++---------- website/docs/usage/training.md | 7 ++-- 28 files changed, 95 insertions(+), 92 deletions(-) diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja index a0ffa8f52..9a8b9d1d7 100644 --- a/spacy/cli/templates/quickstart_training.jinja +++ b/spacy/cli/templates/quickstart_training.jinja @@ -317,21 +317,3 @@ start = 100 stop = 1000 compound = 1.001 {% endif %} - -[training.score_weights] -{%- if "tagger" in components %} -tag_acc = {{ (1.0 / components|length)|round(2) }} -{%- endif -%} -{%- if "parser" in components %} -dep_uas = 0.0 -dep_las = {{ (1.0 / components|length)|round(2) }} -sents_f = 0.0 -{%- endif %} -{%- if "ner" in components %} -ents_f = {{ (1.0 / components|length)|round(2) }} -ents_p = 0.0 -ents_r = 0.0 -{%- endif %} -{%- if "textcat" in components %} -cats_score = {{ (1.0 / components|length)|round(2) }} -{%- endif -%} diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 2900ef379..3485a4ff2 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -209,6 +209,8 @@ def create_train_batches(iterator, batcher, max_epochs: int): def create_evaluation_callback( nlp: Language, dev_corpus: Callable, weights: Dict[str, float] ) -> Callable[[], Tuple[float, Dict[str, float]]]: + weights = {key: value for key, value in weights.items() if value is not None} + 
def evaluate() -> Tuple[float, Dict[str, float]]: dev_examples = list(dev_corpus(nlp)) scores = nlp.evaluate(dev_examples) @@ -368,7 +370,8 @@ def update_meta( ) -> None: nlp.meta["performance"] = {} for metric in training["score_weights"]: - nlp.meta["performance"][metric] = info["other_scores"].get(metric, 0.0) + if metric is not None: + nlp.meta["performance"][metric] = info["other_scores"].get(metric, 0.0) for pipe_name in nlp.pipe_names: nlp.meta["performance"][f"{pipe_name}_loss"] = info["losses"][pipe_name] diff --git a/spacy/lang/bn/__init__.py b/spacy/lang/bn/__init__.py index 270185a4b..923e29a17 100644 --- a/spacy/lang/bn/__init__.py +++ b/spacy/lang/bn/__init__.py @@ -25,7 +25,6 @@ class Bengali(Language): "lemmatizer", assigns=["token.lemma"], default_config={"model": None, "mode": "rule", "lookups": None}, - scores=["lemma_acc"], default_score_weights={"lemma_acc": 1.0}, ) def make_lemmatizer( diff --git a/spacy/lang/el/__init__.py b/spacy/lang/el/__init__.py index 0c5e0672b..1a7b19914 100644 --- a/spacy/lang/el/__init__.py +++ b/spacy/lang/el/__init__.py @@ -30,7 +30,6 @@ class Greek(Language): "lemmatizer", assigns=["token.lemma"], default_config={"model": None, "mode": "rule", "lookups": None}, - scores=["lemma_acc"], default_score_weights={"lemma_acc": 1.0}, ) def make_lemmatizer( diff --git a/spacy/lang/en/__init__.py b/spacy/lang/en/__init__.py index 1a595b6e7..bf7e9987f 100644 --- a/spacy/lang/en/__init__.py +++ b/spacy/lang/en/__init__.py @@ -29,7 +29,6 @@ class English(Language): "lemmatizer", assigns=["token.lemma"], default_config={"model": None, "mode": "rule", "lookups": None}, - scores=["lemma_acc"], default_score_weights={"lemma_acc": 1.0}, ) def make_lemmatizer( diff --git a/spacy/lang/fa/__init__.py b/spacy/lang/fa/__init__.py index 244534120..f3a6635dc 100644 --- a/spacy/lang/fa/__init__.py +++ b/spacy/lang/fa/__init__.py @@ -28,7 +28,6 @@ class Persian(Language): "lemmatizer", assigns=["token.lemma"], default_config={"model": None, "mode": "rule", "lookups": None}, - scores=["lemma_acc"], default_score_weights={"lemma_acc": 1.0}, ) def make_lemmatizer( diff --git a/spacy/lang/fr/__init__.py b/spacy/lang/fr/__init__.py index 42241cd8a..72e641d1f 100644 --- a/spacy/lang/fr/__init__.py +++ b/spacy/lang/fr/__init__.py @@ -33,7 +33,6 @@ class French(Language): "lemmatizer", assigns=["token.lemma"], default_config={"model": None, "mode": "rule", "lookups": None}, - scores=["lemma_acc"], default_score_weights={"lemma_acc": 1.0}, ) def make_lemmatizer( diff --git a/spacy/lang/nb/__init__.py b/spacy/lang/nb/__init__.py index 28a2f0bf2..9672dfd6e 100644 --- a/spacy/lang/nb/__init__.py +++ b/spacy/lang/nb/__init__.py @@ -28,7 +28,6 @@ class Norwegian(Language): "lemmatizer", assigns=["token.lemma"], default_config={"model": None, "mode": "rule", "lookups": None}, - scores=["lemma_acc"], default_score_weights={"lemma_acc": 1.0}, ) def make_lemmatizer( diff --git a/spacy/lang/nl/__init__.py b/spacy/lang/nl/__init__.py index 1526e41f5..15b6b9de2 100644 --- a/spacy/lang/nl/__init__.py +++ b/spacy/lang/nl/__init__.py @@ -30,7 +30,6 @@ class Dutch(Language): "lemmatizer", assigns=["token.lemma"], default_config={"model": None, "mode": "rule", "lookups": None}, - scores=["lemma_acc"], default_score_weights={"lemma_acc": 1.0}, ) def make_lemmatizer( diff --git a/spacy/lang/pl/__init__.py b/spacy/lang/pl/__init__.py index 7ddad9893..573dbc6f9 100644 --- a/spacy/lang/pl/__init__.py +++ b/spacy/lang/pl/__init__.py @@ -35,7 +35,6 @@ class Polish(Language): "lemmatizer", 
assigns=["token.lemma"], default_config={"model": None, "mode": "pos_lookup", "lookups": None}, - scores=["lemma_acc"], default_score_weights={"lemma_acc": 1.0}, ) def make_lemmatizer( diff --git a/spacy/lang/ru/__init__.py b/spacy/lang/ru/__init__.py index be770e3ec..4a296dd23 100644 --- a/spacy/lang/ru/__init__.py +++ b/spacy/lang/ru/__init__.py @@ -25,7 +25,6 @@ class Russian(Language): "lemmatizer", assigns=["token.lemma"], default_config={"model": None, "mode": "pymorphy2", "lookups": None}, - scores=["lemma_acc"], default_score_weights={"lemma_acc": 1.0}, ) def make_lemmatizer( diff --git a/spacy/lang/sv/__init__.py b/spacy/lang/sv/__init__.py index 6db74cd39..ea314f487 100644 --- a/spacy/lang/sv/__init__.py +++ b/spacy/lang/sv/__init__.py @@ -31,7 +31,6 @@ class Swedish(Language): "lemmatizer", assigns=["token.lemma"], default_config={"model": None, "mode": "rule", "lookups": None}, - scores=["lemma_acc"], default_score_weights={"lemma_acc": 1.0}, ) def make_lemmatizer( diff --git a/spacy/lang/uk/__init__.py b/spacy/lang/uk/__init__.py index e9936cf7d..006a1cf7f 100644 --- a/spacy/lang/uk/__init__.py +++ b/spacy/lang/uk/__init__.py @@ -25,7 +25,6 @@ class Ukrainian(Language): "lemmatizer", assigns=["token.lemma"], default_config={"model": None, "mode": "pymorphy2", "lookups": None}, - scores=["lemma_acc"], default_score_weights={"lemma_acc": 1.0}, ) def make_lemmatizer( diff --git a/spacy/language.py b/spacy/language.py index 4dffd9679..0b7deacad 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -248,9 +248,15 @@ class Language: self._config["nlp"]["pipeline"] = list(self.component_names) self._config["nlp"]["disabled"] = list(self.disabled) self._config["components"] = pipeline - if not self._config["training"].get("score_weights"): - combined_score_weights = combine_score_weights(score_weights) - self._config["training"]["score_weights"] = combined_score_weights + # We're merging the existing score weights back into the combined + # weights to make sure we're preserving custom settings in the config + # but also reflect updates (e.g. new components added) + prev_score_weights = self._config["training"].get("score_weights", {}) + combined_score_weights = combine_score_weights(score_weights) + combined_score_weights.update(prev_score_weights) + # Combine the scores a second time to normalize them + combined_score_weights = combine_score_weights([combined_score_weights]) + self._config["training"]["score_weights"] = combined_score_weights if not srsly.is_json_serializable(self._config): raise ValueError(Errors.E961.format(config=self._config)) return self._config @@ -412,7 +418,6 @@ class Language: assigns: Iterable[str] = SimpleFrozenList(), requires: Iterable[str] = SimpleFrozenList(), retokenizes: bool = False, - scores: Iterable[str] = SimpleFrozenList(), default_score_weights: Dict[str, float] = SimpleFrozenDict(), func: Optional[Callable] = None, ) -> Callable: @@ -430,12 +435,11 @@ class Language: e.g. "token.ent_id". Used for pipeline analyis. retokenizes (bool): Whether the component changes the tokenization. Used for pipeline analysis. - scores (Iterable[str]): All scores set by the component if it's trainable, - e.g. ["ents_f", "ents_r", "ents_p"]. default_score_weights (Dict[str, float]): The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to 1.0 per component and - will be combined and normalized for the whole pipeline. + will be combined and normalized for the whole pipeline. 
If None, + the score won't be shown in the logs or be weighted. func (Optional[Callable]): Factory function if not used as a decorator. DOCS: https://nightly.spacy.io/api/language#factory @@ -475,7 +479,7 @@ class Language: default_config=default_config, assigns=validate_attrs(assigns), requires=validate_attrs(requires), - scores=scores, + scores=list(default_score_weights.keys()), default_score_weights=default_score_weights, retokenizes=retokenizes, ) diff --git a/spacy/pipeline/dep_parser.pyx b/spacy/pipeline/dep_parser.pyx index a49475c8e..a447434d2 100644 --- a/spacy/pipeline/dep_parser.pyx +++ b/spacy/pipeline/dep_parser.pyx @@ -43,8 +43,14 @@ DEFAULT_PARSER_MODEL = Config().from_str(default_model_config)["model"] "min_action_freq": 30, "model": DEFAULT_PARSER_MODEL, }, - scores=["dep_uas", "dep_las", "dep_las_per_type", "sents_p", "sents_r", "sents_f"], - default_score_weights={"dep_uas": 0.5, "dep_las": 0.5, "sents_f": 0.0}, + default_score_weights={ + "dep_uas": 0.5, + "dep_las": 0.5, + "dep_las_per_type": None, + "sents_p": None, + "sents_r": None, + "sents_f": 0.0, + }, ) def make_parser( nlp: Language, diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index 24bbb067f..9166a69b8 100644 --- a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -25,8 +25,12 @@ PatternType = Dict[str, Union[str, List[Dict[str, Any]]]] "overwrite_ents": False, "ent_id_sep": DEFAULT_ENT_ID_SEP, }, - scores=["ents_p", "ents_r", "ents_f", "ents_per_type"], - default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0}, + default_score_weights={ + "ents_f": 1.0, + "ents_p": 0.0, + "ents_r": 0.0, + "ents_per_type": None, + }, ) def make_entity_ruler( nlp: Language, diff --git a/spacy/pipeline/lemmatizer.py b/spacy/pipeline/lemmatizer.py index 0fd3482c4..c30d09f62 100644 --- a/spacy/pipeline/lemmatizer.py +++ b/spacy/pipeline/lemmatizer.py @@ -21,7 +21,6 @@ from .. import util "lookups": None, "overwrite": False, }, - scores=["lemma_acc"], default_score_weights={"lemma_acc": 1.0}, ) def make_lemmatizer( diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index 62ad9e0eb..5fee9a900 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -49,8 +49,7 @@ DEFAULT_MORPH_MODEL = Config().from_str(default_model_config)["model"] "morphologizer", assigns=["token.morph", "token.pos"], default_config={"model": DEFAULT_MORPH_MODEL}, - scores=["pos_acc", "morph_acc", "morph_per_feat"], - default_score_weights={"pos_acc": 0.5, "morph_acc": 0.5}, + default_score_weights={"pos_acc": 0.5, "morph_acc": 0.5, "morph_per_feat": None}, ) def make_morphologizer( nlp: Language, diff --git a/spacy/pipeline/ner.pyx b/spacy/pipeline/ner.pyx index fc4f03473..c9b0a5031 100644 --- a/spacy/pipeline/ner.pyx +++ b/spacy/pipeline/ner.pyx @@ -39,8 +39,7 @@ DEFAULT_NER_MODEL = Config().from_str(default_model_config)["model"] "update_with_oracle_cut_size": 100, "model": DEFAULT_NER_MODEL, }, - scores=["ents_p", "ents_r", "ents_f", "ents_per_type"], - default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0}, + default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None}, ) def make_ner( diff --git a/spacy/pipeline/sentencizer.pyx b/spacy/pipeline/sentencizer.pyx index 5700c2b98..2882f6f8b 100644 --- a/spacy/pipeline/sentencizer.pyx +++ b/spacy/pipeline/sentencizer.pyx @@ -15,7 +15,6 @@ from .. 
import util "sentencizer", assigns=["token.is_sent_start", "doc.sents"], default_config={"punct_chars": None}, - scores=["sents_p", "sents_r", "sents_f"], default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0}, ) def make_sentencizer( diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx index a7eb721fd..da85a9cf2 100644 --- a/spacy/pipeline/senter.pyx +++ b/spacy/pipeline/senter.pyx @@ -36,7 +36,6 @@ DEFAULT_SENTER_MODEL = Config().from_str(default_model_config)["model"] "senter", assigns=["token.is_sent_start"], default_config={"model": DEFAULT_SENTER_MODEL}, - scores=["sents_p", "sents_r", "sents_f"], default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0}, ) def make_senter(nlp: Language, name: str, model: Model): diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx index 0d78047ae..3efe29916 100644 --- a/spacy/pipeline/tagger.pyx +++ b/spacy/pipeline/tagger.pyx @@ -42,7 +42,6 @@ DEFAULT_TAGGER_MODEL = Config().from_str(default_model_config)["model"] "tagger", assigns=["token.tag"], default_config={"model": DEFAULT_TAGGER_MODEL}, - scores=["tag_acc"], default_score_weights={"tag_acc": 1.0}, ) def make_tagger(nlp: Language, name: str, model: Model): diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py index e7cb62a0d..6b8c0ca65 100644 --- a/spacy/pipeline/textcat.py +++ b/spacy/pipeline/textcat.py @@ -62,18 +62,17 @@ subword_features = true "positive_label": None, "model": DEFAULT_TEXTCAT_MODEL, }, - scores=[ - "cats_score", - "cats_score_desc", - "cats_p", - "cats_r", - "cats_f", - "cats_macro_f", - "cats_macro_auc", - "cats_f_per_type", - "cats_macro_auc_per_type", - ], - default_score_weights={"cats_score": 1.0}, + default_score_weights={ + "cats_score": 1.0, + "cats_score_desc": None, + "cats_p": None, + "cats_r": None, + "cats_f": None, + "cats_macro_f": None, + "cats_macro_auc": None, + "cats_f_per_type": None, + "cats_macro_auc_per_type": None, + }, ) def make_textcat( nlp: Language, diff --git a/spacy/schemas.py b/spacy/schemas.py index b0f26dcd7..e34841008 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -211,7 +211,7 @@ class ConfigSchemaTraining(BaseModel): seed: Optional[StrictInt] = Field(..., title="Random seed") gpu_allocator: Optional[StrictStr] = Field(..., title="Memory allocator when running on GPU") accumulate_gradient: StrictInt = Field(..., title="Whether to divide the batch up into substeps") - score_weights: Dict[StrictStr, Union[StrictFloat, StrictInt]] = Field(..., title="Scores to report and their weights for selecting final model") + score_weights: Dict[StrictStr, Optional[Union[StrictFloat, StrictInt]]] = Field(..., title="Scores to report and their weights for selecting final model") init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights") raw_text: Optional[StrictStr] = Field(default=None, title="Raw text") optimizer: Optimizer = Field(..., title="The optimizer to use") diff --git a/spacy/tests/pipeline/test_pipe_factories.py b/spacy/tests/pipeline/test_pipe_factories.py index 881460704..4ab1c4248 100644 --- a/spacy/tests/pipeline/test_pipe_factories.py +++ b/spacy/tests/pipeline/test_pipe_factories.py @@ -359,12 +359,8 @@ def test_language_factories_scores(): func = lambda nlp, name: lambda doc: doc weights1 = {"a1": 0.5, "a2": 0.5} weights2 = {"b1": 0.2, "b2": 0.7, "b3": 0.1} - Language.factory( - f"{name}1", scores=list(weights1), default_score_weights=weights1, func=func, - ) - Language.factory( - f"{name}2", scores=list(weights2), 
default_score_weights=weights2, func=func, - ) + Language.factory(f"{name}1", default_score_weights=weights1, func=func) + Language.factory(f"{name}2", default_score_weights=weights2, func=func) meta1 = Language.get_factory_meta(f"{name}1") assert meta1.default_score_weights == weights1 meta2 = Language.get_factory_meta(f"{name}2") @@ -376,6 +372,21 @@ def test_language_factories_scores(): cfg = nlp.config["training"] expected_weights = {"a1": 0.25, "a2": 0.25, "b1": 0.1, "b2": 0.35, "b3": 0.05} assert cfg["score_weights"] == expected_weights + # Test with custom defaults + config = nlp.config.copy() + config["training"]["score_weights"]["a1"] = 0.0 + config["training"]["score_weights"]["b3"] = 1.0 + nlp = English.from_config(config) + score_weights = nlp.config["training"]["score_weights"] + expected = {"a1": 0.0, "a2": 0.15, "b1": 0.06, "b2": 0.21, "b3": 0.59} + assert score_weights == expected + # Test with null values + config = nlp.config.copy() + config["training"]["score_weights"]["a1"] = None + nlp = English.from_config(config) + score_weights = nlp.config["training"]["score_weights"] + expected = {"a1": None, "a2": 0.15, "b1": 0.06, "b2": 0.21, "b3": 0.58} # rounding :( + assert score_weights == expected def test_pipe_factories_from_source(): diff --git a/spacy/util.py b/spacy/util.py index 025fe5288..f7c5cff59 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -1209,8 +1209,19 @@ def combine_score_weights(weights: List[Dict[str, float]]) -> Dict[str, float]: weights (List[dict]): The weights defined by the components. RETURNS (Dict[str, float]): The combined and normalized weights. """ + # We first need to extract all None/null values for score weights that + # shouldn't be shown in the table *or* be weighted result = {} + all_weights = [] for w_dict in weights: + filtered_weights = {} + for key, value in w_dict.items(): + if value is None: + result[key] = None + else: + filtered_weights[key] = value + all_weights.append(filtered_weights) + for w_dict in all_weights: # We need to account for weights that don't sum to 1.0 and normalize # the score weights accordingly, then divide score by the number of # components. diff --git a/website/docs/api/language.md b/website/docs/api/language.md index a7b9c0d88..dd3cc57dd 100644 --- a/website/docs/api/language.md +++ b/website/docs/api/language.md @@ -145,17 +145,16 @@ examples, see the > ) > ``` -| Name | Description | -| ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `name` | The name of the component factory. ~~str~~ | -| _keyword-only_ | | -| `default_config` | The default config, describing the default values of the factory arguments. ~~Dict[str, Any]~~ | -| `assigns` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ | -| `requires` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ | -| `retokenizes` | Whether the component changes tokenization. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~bool~~ | -| `scores` | All scores set by the components if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). 
~~Iterable[str]~~ | -| `default_score_weights` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. ~~Dict[str, float]~~ | -| `func` | Optional function if not used a a decorator. ~~Optional[Callable[[...], Callable[[Doc], Doc]]]~~ | +| Name | Description | +| ----------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `name` | The name of the component factory. ~~str~~ | +| _keyword-only_ | | +| `default_config` | The default config, describing the default values of the factory arguments. ~~Dict[str, Any]~~ | +| `assigns` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ | +| `requires` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ | +| `retokenizes` | Whether the component changes tokenization. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~bool~~ | +| `default_score_weights` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. If a weight is set to `None`, the score will not be logged or weighted. ~~Dict[str, Optional[float]]~~ | +| `func` | Optional function if not used a a decorator. ~~Optional[Callable[[...], Callable[[Doc], Doc]]]~~ | ## Language.\_\_call\_\_ {#call tag="method"} @@ -1036,12 +1035,12 @@ provided by the [`@Language.component`](/api/language#component) or component is defined and stored on the `Language` class for each component instance and factory instance. -| Name | Description | -| ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `factory` | The name of the registered component factory. ~~str~~ | -| `default_config` | The default config, describing the default values of the factory arguments. ~~Dict[str, Any]~~ | -| `assigns` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ | -| `requires` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~  | -| `retokenizes` | Whether the component changes tokenization. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~bool~~  | -| `scores` | All scores set by the components if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ | -| `default_score_weights` | The scores to report during training, and their default weight towards the final score used to select the best model. 
Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. ~~Dict[str, float]~~ | +| Name | Description | +| ----------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `factory` | The name of the registered component factory. ~~str~~ | +| `default_config` | The default config, describing the default values of the factory arguments. ~~Dict[str, Any]~~ | +| `assigns` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ | +| `requires` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~  | +| `retokenizes` | Whether the component changes tokenization. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~bool~~  | +| `default_score_weights` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. If a weight is set to `None`, the score will not be logged or weighted. ~~Dict[str, Optional[float]]~~ | +| `scores` | All scores set by the components if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. Based on the `default_score_weights` and used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ | diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md index b63145636..65afd0eb4 100644 --- a/website/docs/usage/training.md +++ b/website/docs/usage/training.md @@ -470,6 +470,7 @@ score. ```ini [training.score_weights] dep_las = 0.4 +dep_uas = null ents_f = 0.4 tag_acc = 0.2 token_acc = 0.0 @@ -481,9 +482,9 @@ you generate a config for a given pipeline, the score weights are generated by combining and normalizing the default score weights of the pipeline components. The default score weights are defined by each pipeline component via the `default_score_weights` setting on the -[`@Language.component`](/api/language#component) or -[`@Language.factory`](/api/language#factory). By default, all pipeline -components are weighted equally. +[`@Language.factory`](/api/language#factory) decorator. By default, all pipeline +components are weighted equally. If a score weight is set to `null`, it will be +excluded from the logs and the score won't be weighted. From 17a6b0a1731321380914d3638e7e3bc25fd23a28 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 24 Sep 2020 10:30:42 +0200 Subject: [PATCH 41/47] Make project pull order insensitive (#6131) --- spacy/cli/project/pull.py | 35 ++++++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/spacy/cli/project/pull.py b/spacy/cli/project/pull.py index edcd410bd..3119d3a12 100644 --- a/spacy/cli/project/pull.py +++ b/spacy/cli/project/pull.py @@ -27,19 +27,32 @@ def project_pull_cli( def project_pull(project_dir: Path, remote: str, *, verbose: bool = False): + # TODO: We don't have tests for this :(. It would take a bit of mockery to + # set up. I guess see if it breaks first? 
config = load_project_config(project_dir) if remote in config.get("remotes", {}): remote = config["remotes"][remote] storage = RemoteStorage(project_dir, remote) - for cmd in config.get("commands", []): - deps = [project_dir / dep for dep in cmd.get("deps", [])] - if any(not dep.exists() for dep in deps): - continue - cmd_hash = get_command_hash("", "", deps, cmd["script"]) - for output_path in cmd.get("outputs", []): - url = storage.pull(output_path, command_hash=cmd_hash) - yield url, output_path + commands = list(config.get("commands", [])) + # We use a while loop here because we don't know how the commands + # will be ordered. A command might need dependencies from one that's later + # in the list. + while commands: + for i, cmd in enumerate(list(commands)): + deps = [project_dir / dep for dep in cmd.get("deps", [])] + if all(dep.exists() for dep in deps): + cmd_hash = get_command_hash("", "", deps, cmd["script"]) + for output_path in cmd.get("outputs", []): + url = storage.pull(output_path, command_hash=cmd_hash) + yield url, output_path - out_locs = [project_dir / out for out in cmd.get("outputs", [])] - if all(loc.exists() for loc in out_locs): - update_lockfile(project_dir, cmd) + out_locs = [project_dir / out for out in cmd.get("outputs", [])] + if all(loc.exists() for loc in out_locs): + update_lockfile(project_dir, cmd) + # We remove the command from the list here, and break, so that + # we iterate over the loop again. + commands.remove(i) + break + else: + # If we didn't break the for loop, break the while loop. + break From c645c4e7ceddbd819b7a56e56f013bb8447dea4b Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Thu, 24 Sep 2020 10:31:17 +0200 Subject: [PATCH 42/47] fix micro PRF for textcat (#6130) * fix micro PRF for textcat * small fix --- spacy/scorer.py | 8 ++++---- spacy/tests/pipeline/test_textcat.py | 29 ++++++++++++++++++++++++++++ 2 files changed, 33 insertions(+), 4 deletions(-) diff --git a/spacy/scorer.py b/spacy/scorer.py index da22d59d4..c50de3d43 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -240,7 +240,7 @@ class Scorer: pred_per_feat[field].add((gold_i, feat)) for field in per_feat: per_feat[field].score_set( - pred_per_feat.get(field, set()), gold_per_feat.get(field, set()), + pred_per_feat.get(field, set()), gold_per_feat.get(field, set()) ) result = {k: v.to_dict() for k, v in per_feat.items()} return {f"{attr}_per_feat": result} @@ -418,9 +418,9 @@ class Scorer: f_per_type[pred_label].fp += 1 micro_prf = PRFScore() for label_prf in f_per_type.values(): - micro_prf.tp = label_prf.tp - micro_prf.fn = label_prf.fn - micro_prf.fp = label_prf.fp + micro_prf.tp += label_prf.tp + micro_prf.fn += label_prf.fn + micro_prf.fp += label_prf.fp n_cats = len(f_per_type) + 1e-100 macro_p = sum(prf.precision for prf in f_per_type.values()) / n_cats macro_r = sum(prf.recall for prf in f_per_type.values()) / n_cats diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py index 99b5132ca..232b53e1d 100644 --- a/spacy/tests/pipeline/test_textcat.py +++ b/spacy/tests/pipeline/test_textcat.py @@ -8,6 +8,7 @@ from spacy.language import Language from spacy.pipeline import TextCategorizer from spacy.tokens import Doc from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL +from spacy.scorer import Scorer from ..util import make_tempdir from ...cli.train import verify_textcat_config @@ -224,3 +225,31 @@ def test_positive_class_not_binary(): assert textcat.labels == ("SOME", "THING", "POS") with pytest.raises(ValueError): 
verify_textcat_config(nlp, pipe_config) + +def test_textcat_evaluation(): + train_examples = [] + nlp = English() + ref1 = nlp("one") + ref1.cats = {"winter": 1.0, "summer": 1.0, "spring": 1.0, "autumn": 1.0} + pred1 = nlp("one") + pred1.cats = {"winter": 1.0, "summer": 0.0, "spring": 1.0, "autumn": 1.0} + train_examples.append(Example(pred1, ref1)) + + ref2 = nlp("two") + ref2.cats = {"winter": 0.0, "summer": 0.0, "spring": 1.0, "autumn": 1.0} + pred2 = nlp("two") + pred2.cats = {"winter": 1.0, "summer": 0.0, "spring": 0.0, "autumn": 1.0} + train_examples.append(Example(pred2, ref2)) + + scores = Scorer().score_cats(train_examples, "cats", labels=["winter", "summer", "spring", "autumn"]) + assert scores["cats_f_per_type"]["winter"]["p"] == 1/2 + assert scores["cats_f_per_type"]["winter"]["r"] == 1/1 + assert scores["cats_f_per_type"]["summer"]["p"] == 0 + assert scores["cats_f_per_type"]["summer"]["r"] == 0/1 + assert scores["cats_f_per_type"]["spring"]["p"] == 1/1 + assert scores["cats_f_per_type"]["spring"]["r"] == 1/2 + assert scores["cats_f_per_type"]["autumn"]["p"] == 2/2 + assert scores["cats_f_per_type"]["autumn"]["r"] == 2/2 + + assert scores["cats_micro_p"] == 4/5 + assert scores["cats_micro_r"] == 4/6 From 4bbe41f017ffc6334a35f2a682804cf6365dfd9e Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 24 Sep 2020 10:42:47 +0200 Subject: [PATCH 43/47] Fix combined scores and update test --- spacy/language.py | 7 ++----- spacy/tests/pipeline/test_pipe_factories.py | 4 ++-- spacy/util.py | 10 ++++++++-- 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index 0b7deacad..a52391419 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -251,11 +251,8 @@ class Language: # We're merging the existing score weights back into the combined # weights to make sure we're preserving custom settings in the config # but also reflect updates (e.g. 
new components added) - prev_score_weights = self._config["training"].get("score_weights", {}) - combined_score_weights = combine_score_weights(score_weights) - combined_score_weights.update(prev_score_weights) - # Combine the scores a second time to normalize them - combined_score_weights = combine_score_weights([combined_score_weights]) + prev_weights = self._config["training"].get("score_weights", {}) + combined_score_weights = combine_score_weights(score_weights, prev_weights) self._config["training"]["score_weights"] = combined_score_weights if not srsly.is_json_serializable(self._config): raise ValueError(Errors.E961.format(config=self._config)) diff --git a/spacy/tests/pipeline/test_pipe_factories.py b/spacy/tests/pipeline/test_pipe_factories.py index 4ab1c4248..4c197005e 100644 --- a/spacy/tests/pipeline/test_pipe_factories.py +++ b/spacy/tests/pipeline/test_pipe_factories.py @@ -378,14 +378,14 @@ def test_language_factories_scores(): config["training"]["score_weights"]["b3"] = 1.0 nlp = English.from_config(config) score_weights = nlp.config["training"]["score_weights"] - expected = {"a1": 0.0, "a2": 0.15, "b1": 0.06, "b2": 0.21, "b3": 0.59} + expected = {"a1": 0.0, "a2": 0.5, "b1": 0.03, "b2": 0.12, "b3": 0.34} assert score_weights == expected # Test with null values config = nlp.config.copy() config["training"]["score_weights"]["a1"] = None nlp = English.from_config(config) score_weights = nlp.config["training"]["score_weights"] - expected = {"a1": None, "a2": 0.15, "b1": 0.06, "b2": 0.21, "b3": 0.58} # rounding :( + expected = {"a1": None, "a2": 0.5, "b1": 0.03, "b2": 0.12, "b3": 0.35} assert score_weights == expected diff --git a/spacy/util.py b/spacy/util.py index f7c5cff59..709da8d29 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -1202,11 +1202,16 @@ def get_arg_names(func: Callable) -> List[str]: return list(set([*argspec.args, *argspec.kwonlyargs])) -def combine_score_weights(weights: List[Dict[str, float]]) -> Dict[str, float]: +def combine_score_weights( + weights: List[Dict[str, float]], + overrides: Dict[str, Optional[Union[float, int]]] = SimpleFrozenDict(), +) -> Dict[str, float]: """Combine and normalize score weights defined by components, e.g. {"ents_r": 0.2, "ents_p": 0.3, "ents_f": 0.5} and {"some_other_score": 1.0}. weights (List[dict]): The weights defined by the components. + overrides (Dict[str, Optional[Union[float, int]]]): Existing scores that + should be preserved. RETURNS (Dict[str, float]): The combined and normalized weights. """ # We first need to extract all None/null values for score weights that @@ -1216,6 +1221,7 @@ def combine_score_weights(weights: List[Dict[str, float]]) -> Dict[str, float]: for w_dict in weights: filtered_weights = {} for key, value in w_dict.items(): + value = overrides.get(key, value) if value is None: result[key] = None else: @@ -1227,7 +1233,7 @@ def combine_score_weights(weights: List[Dict[str, float]]) -> Dict[str, float]: # components. 
total = sum(w_dict.values()) for key, value in w_dict.items(): - weight = round(value / total / len(weights), 2) + weight = round(value / total / len(all_weights), 2) result[key] = result.get(key, 0.0) + weight return result From 4eb39b5c43c74f8eabc1b2a8fa3b68e8baa02d3a Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 24 Sep 2020 11:04:35 +0200 Subject: [PATCH 44/47] Fix logging --- spacy/errors.py | 1 + spacy/training/loggers.py | 16 +++++++++++----- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index 47a134c1f..ee2091225 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -480,6 +480,7 @@ class Errors: E201 = ("Span index out of range.") # TODO: fix numbering after merging develop into master + E916 = ("Can't log score for '{name}' in table: not a valid score ({score_type})") E917 = ("Received invalid value {value} for 'state_type' in " "TransitionBasedParser: only 'parser' or 'ner' are valid options.") E918 = ("Received invalid value for vocab: {vocab} ({vocab_type}). Valid " diff --git a/spacy/training/loggers.py b/spacy/training/loggers.py index dddf20169..d35b5a4bd 100644 --- a/spacy/training/loggers.py +++ b/spacy/training/loggers.py @@ -13,7 +13,8 @@ def console_logger(): ) -> Tuple[Callable[[Dict[str, Any]], None], Callable]: # we assume here that only components are enabled that should be trained & logged logged_pipes = nlp.pipe_names - score_cols = list(nlp.config["training"]["score_weights"]) + score_weights = nlp.config["training"]["score_weights"] + score_cols = [col for col, value in score_weights.items() if value is not None] score_widths = [max(len(col), 6) for col in score_cols] loss_cols = [f"Loss {pipe}" for pipe in logged_pipes] loss_widths = [max(len(col), 8) for col in loss_cols] @@ -40,10 +41,15 @@ def console_logger(): ) from None scores = [] for col in score_cols: - score = float(info["other_scores"].get(col, 0.0)) - if col != "speed": - score *= 100 - scores.append("{0:.2f}".format(score)) + score = info["other_scores"].get(col, 0.0) + try: + score = float(score) + if col != "speed": + score *= 100 + scores.append("{0:.2f}".format(score)) + except TypeError: + err = Errors.E916.format(name=col, score_type=type(score)) + raise TypeError(err) from None data = ( [info["epoch"], info["step"]] + losses From f69fea8b252ac5f28c4daac40046df507ab6f07f Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 24 Sep 2020 11:29:07 +0200 Subject: [PATCH 45/47] Improve error handling around non-number scores --- spacy/cli/train.py | 7 ++++++- spacy/errors.py | 4 ++++ spacy/training/loggers.py | 2 +- 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 3485a4ff2..eabc82be0 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -214,7 +214,12 @@ def create_evaluation_callback( def evaluate() -> Tuple[float, Dict[str, float]]: dev_examples = list(dev_corpus(nlp)) scores = nlp.evaluate(dev_examples) - # Calculate a weighted sum based on score_weights for the main score + # Calculate a weighted sum based on score_weights for the main score. + # We can only consider scores that are ints/floats, not dicts like + # entity scores per type etc. 
+ for key, value in scores.items(): + if key in weights and not isinstance(value, (int, float)): + raise ValueError(Errors.E915.format(name=key, score_type=type(value))) try: weighted_score = sum( scores.get(s, 0.0) * weights.get(s, 0.0) for s in weights diff --git a/spacy/errors.py b/spacy/errors.py index ee2091225..dce5cf51c 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -480,6 +480,10 @@ class Errors: E201 = ("Span index out of range.") # TODO: fix numbering after merging develop into master + E915 = ("Can't use score '{name}' to calculate final weighted score. Expected " + "float or int but got: {score_type}. To exclude the score from the " + "final score, set its weight to null in the [training.score_weights] " + "section of your training config.") E916 = ("Can't log score for '{name}' in table: not a valid score ({score_type})") E917 = ("Received invalid value {value} for 'state_type' in " "TransitionBasedParser: only 'parser' or 'ner' are valid options.") diff --git a/spacy/training/loggers.py b/spacy/training/loggers.py index d35b5a4bd..0f054d433 100644 --- a/spacy/training/loggers.py +++ b/spacy/training/loggers.py @@ -49,7 +49,7 @@ def console_logger(): scores.append("{0:.2f}".format(score)) except TypeError: err = Errors.E916.format(name=col, score_type=type(score)) - raise TypeError(err) from None + raise ValueError(err) from None data = ( [info["epoch"], info["step"]] + losses From d7ab6a2ffe8e11ee644286ea815bae8cf59bfabb Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 24 Sep 2020 12:37:21 +0200 Subject: [PATCH 46/47] Update docs [ci skip] --- website/docs/usage/_benchmarks-models.md | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/website/docs/usage/_benchmarks-models.md b/website/docs/usage/_benchmarks-models.md index a00229867..4b25418b5 100644 --- a/website/docs/usage/_benchmarks-models.md +++ b/website/docs/usage/_benchmarks-models.md @@ -22,12 +22,13 @@ import { Help } from 'components/typography'; import Link from 'components/link'
| Named Entity Recognition Model | OntoNotes | CoNLL '03 | -| ------------------------------------------------------------------------------ | --------: | --------- | -| spaCy RoBERTa (2020) | -| spaCy CNN (2020) | | -| spaCy CNN (2017) | 86.4 | -| [Stanza](https://stanfordnlp.github.io/stanza/) (StanfordNLP)1 | 88.8 | -| Flair2 | 89.7 | +| ------------------------------------------------------------------------------ | --------: | --------: | +| spaCy RoBERTa (2020) | | 92.2 | +| spaCy CNN (2020) | | 88.4 | +| spaCy CNN (2017) | 86.4 | | +| [Stanza](https://stanfordnlp.github.io/stanza/) (StanfordNLP)1 | 88.8 | 92.1 | +| Flair2 | 89.7 | 93.1 | +| BERT Base3 | - | 92.4 |
@@ -36,7 +37,8 @@ import { Help } from 'components/typography'; import Link from 'components/link' [CoNLL-2003](https://www.aclweb.org/anthology/W03-0419.pdf) corpora. See [NLP-progress](http://nlpprogress.com/english/named_entity_recognition.html) for more results. **1. ** [Qi et al. (2020)](https://arxiv.org/pdf/2003.07082.pdf). -**2. ** [Akbik et al. (2018)](https://www.aclweb.org/anthology/C18-1139/) +**2. ** [Akbik et al. (2018)](https://www.aclweb.org/anthology/C18-1139/). **3. +** [Devlin et al. (2018)](https://arxiv.org/abs/1810.04805).
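The `E915` and `E916` errors added in the two logging patches above both point users at the `[training.score_weights]` block: scores that aren't plain numbers, such as the per-type entity breakdown, can't feed into the weighted overall score, and giving them a `null` weight excludes them from both the final score and the console table. A minimal, illustrative sketch of such a block — the score names and weights here are examples rather than values from any shipped config:

```ini
[training.score_weights]
# Numeric scores contribute to the weighted score used to pick the best model
dep_uas = 0.25
dep_las = 0.25
ents_f = 0.5
# Dict-valued scores like the per-type entity breakdown can't be weighted;
# a null weight excludes them, as the E915 message suggests
ents_per_type = null
```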
From 6836b664330926a401d05f16fe95cf475febff08 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 24 Sep 2020 13:41:25 +0200 Subject: [PATCH 47/47] Update docs and resolve todos [ci skip] --- website/docs/usage/_benchmarks-models.md | 8 ++++---- website/docs/usage/embeddings-transformers.md | 2 -- website/docs/usage/facts-figures.md | 2 +- website/docs/usage/linguistic-features.md | 9 ++++++--- website/docs/usage/processing-pipelines.md | 7 +++++-- website/docs/usage/projects.md | 5 ++++- 6 files changed, 20 insertions(+), 13 deletions(-) diff --git a/website/docs/usage/_benchmarks-models.md b/website/docs/usage/_benchmarks-models.md index 4b25418b5..5b193d3a4 100644 --- a/website/docs/usage/_benchmarks-models.md +++ b/website/docs/usage/_benchmarks-models.md @@ -1,10 +1,10 @@ import { Help } from 'components/typography'; import Link from 'components/link' - +


-| System | Parser | Tagger | NER | WPS CPU (words per second on CPU, higher is better) | WPS GPU (words per second on GPU, higher is better) |
+| Pipeline | Parser | Tagger | NER | WPS CPU (words per second on CPU, higher is better) | WPS GPU (words per second on GPU, higher is better) |
| ---------------------------------------------------------- | -----: | -----: | ---: | ------------------------------------------------------------------: | -----------------------------------------------------------------: |
| [`en_core_web_trf`](/models/en#en_core_web_trf) (spaCy v3) | | | | | 6k |
| [`en_core_web_lg`](/models/en#en_core_web_lg) (spaCy v3) | | | | | |
| [`en_core_web_lg`](/models/en#en_core_web_lg) (spaCy v2) | 91.9 | 97.2 | 85.9 | 10k | |
@@ -21,10 +21,10 @@ import { Help } from 'components/typography'; import Link from 'components/link'

-| Named Entity Recognition Model | OntoNotes | CoNLL '03 | +| Named Entity Recognition System | OntoNotes | CoNLL '03 | | ------------------------------------------------------------------------------ | --------: | --------: | | spaCy RoBERTa (2020) | | 92.2 | -| spaCy CNN (2020) | | 88.4 | +| spaCy CNN (2020) | 85.3 | 88.4 | | spaCy CNN (2017) | 86.4 | | | [Stanza](https://stanfordnlp.github.io/stanza/) (StanfordNLP)1 | 88.8 | 92.1 | | Flair2 | 89.7 | 93.1 | diff --git a/website/docs/usage/embeddings-transformers.md b/website/docs/usage/embeddings-transformers.md index d61172a5b..b00760e62 100644 --- a/website/docs/usage/embeddings-transformers.md +++ b/website/docs/usage/embeddings-transformers.md @@ -235,8 +235,6 @@ The `Transformer` component sets the [`Doc._.trf_data`](/api/transformer#custom_attributes) extension attribute, which lets you access the transformers outputs at runtime. - - ```cli $ python -m spacy download en_core_trf_lg ``` diff --git a/website/docs/usage/facts-figures.md b/website/docs/usage/facts-figures.md index 743dae74d..a31559b04 100644 --- a/website/docs/usage/facts-figures.md +++ b/website/docs/usage/facts-figures.md @@ -63,7 +63,7 @@ import Benchmarks from 'usage/\_benchmarks-models.md'
-| System | UAS | LAS | +| Dependency Parsing System | UAS | LAS | | ------------------------------------------------------------------------------ | ---: | ---: | | spaCy RoBERTa (2020)1 | 96.8 | 95.0 | | spaCy CNN (2020)1 | 93.7 | 91.8 | diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md index 914e18acb..d9a894398 100644 --- a/website/docs/usage/linguistic-features.md +++ b/website/docs/usage/linguistic-features.md @@ -1654,9 +1654,12 @@ The [`SentenceRecognizer`](/api/sentencerecognizer) is a simple statistical component that only provides sentence boundaries. Along with being faster and smaller than the parser, its primary advantage is that it's easier to train because it only requires annotated sentence boundaries rather than full -dependency parses. - - +dependency parses. spaCy's [trained pipelines](/models) include both a parser +and a trained sentence segmenter, which is +[disabled](/usage/processing-pipelines#disabling) by default. If you only need +sentence boundaries and no parser, you can use the `enable` and `disable` +arguments on [`spacy.load`](/api/top-level#spacy.load) to enable the senter and +disable the parser. > #### senter vs. parser > diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md index 97806dc2a..dbf0881ac 100644 --- a/website/docs/usage/processing-pipelines.md +++ b/website/docs/usage/processing-pipelines.md @@ -253,8 +253,6 @@ different mechanisms you can use: Disabled and excluded component names can be provided to [`spacy.load`](/api/top-level#spacy.load) as a list. - - > #### 💡 Optional pipeline components > > The `disable` mechanism makes it easy to distribute pipeline packages with @@ -262,6 +260,11 @@ Disabled and excluded component names can be provided to > your pipeline may include a statistical _and_ a rule-based component for > sentence segmentation, and you can choose which one to run depending on your > use case. +> +> For example, spaCy's [trained pipelines](/models) like +> [`en_core_web_sm`](/models/en#en_core_web_sm) contain both a `parser` and +> `senter` that perform sentence segmentation, but the `senter` is disabled by +> default. ```python # Load the pipeline without the entity recognizer diff --git a/website/docs/usage/projects.md b/website/docs/usage/projects.md index 8e093e8d6..6d5746308 100644 --- a/website/docs/usage/projects.md +++ b/website/docs/usage/projects.md @@ -733,7 +733,10 @@ workflows, but only one can be tracked by DVC. The Prodigy integration will require a nightly version of Prodigy that supports -spaCy v3+. +spaCy v3+. You can already use annotations created with Prodigy in spaCy v3 by +exporting your data with +[`data-to-spacy`](https://prodi.gy/docs/recipes#data-to-spacy) and running +[`spacy convert`](/api/cli#convert) to convert it to the binary format.
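The sentence-segmentation note above points out that the trained pipelines ship with both a parser and a statistical `senter`, with the `senter` disabled by default. A small runtime sketch of that setup, assuming a v3 `en_core_web_sm` package is installed (the pipeline name is just an example):

```python
import spacy

# Load the pipeline without the dependency parser and switch on the
# statistical sentence segmenter, which ships disabled by default
nlp = spacy.load("en_core_web_sm", disable=["parser"])
nlp.enable_pipe("senter")

doc = nlp("This is a sentence. This is another one.")
print([sent.text for sent in doc.sents])
```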