diff --git a/README.md b/README.md index d23051af0..3e5e5febe 100644 --- a/README.md +++ b/README.md @@ -8,12 +8,12 @@ be used in real products. spaCy comes with [pretrained pipelines](https://spacy.io/models) and vectors, and -currently supports tokenization for **59+ languages**. It features +currently supports tokenization for **60+ languages**. It features state-of-the-art speed, convolutional **neural network models** for tagging, parsing, **named entity recognition**, **text classification** and more, multi-task learning with pretrained **transformers** like BERT, as well as a production-ready training system and easy model packaging, deployment and workflow management. spaCy is commercial open-source software, released under the MIT license. -💫 **Version 2.3 out now!** +💫 **Version 3.0 out now!** [Check out the release notes here.](https://github.com/explosion/spaCy/releases) [![Azure Pipelines](https://img.shields.io/azure-devops/build/explosion-ai/public/8/master.svg?logo=azure-pipelines&style=flat-square&label=build)](https://dev.azure.com/explosion-ai/public/_build?definitionId=8) @@ -29,16 +29,17 @@ spaCy is commercial open-source software, released under the MIT license. ## 📖 Documentation -| Documentation | | -| --------------- | -------------------------------------------------------------- | -| [spaCy 101] | New to spaCy? Here's everything you need to know! | -| [Usage Guides] | How to use spaCy and its features. | -| [New in v3.0] | New features, backwards incompatibilities and migration guide. | -| [API Reference] | The detailed reference for spaCy's API. | -| [Models] | Download statistical language models for spaCy. | -| [Universe] | Libraries, extensions, demos, books and courses. | -| [Changelog] | Changes and version history. | -| [Contribute] | How to contribute to the spaCy project and code base. | +| Documentation | | +| ------------------- | -------------------------------------------------------------- | +| [spaCy 101] | New to spaCy? Here's everything you need to know! | +| [Usage Guides] | How to use spaCy and its features. | +| [New in v3.0] | New features, backwards incompatibilities and migration guide. | +| [Project Templates] | End-to-end workflows you can clone, modify and run. | +| [API Reference] | The detailed reference for spaCy's API. | +| [Models] | Download statistical language models for spaCy. | +| [Universe] | Libraries, extensions, demos, books and courses. | +| [Changelog] | Changes and version history. | +| [Contribute] | How to contribute to the spaCy project and code base. | [spacy 101]: https://spacy.io/usage/spacy-101 [new in v3.0]: https://spacy.io/usage/v3 @@ -46,6 +47,7 @@ spaCy is commercial open-source software, released under the MIT license. [api reference]: https://spacy.io/api/ [models]: https://spacy.io/models [universe]: https://spacy.io/universe +[project templates]: https://github.com/explosion/projects [changelog]: https://spacy.io/usage#changelog [contribute]: https://github.com/explosion/spaCy/blob/master/CONTRIBUTING.md @@ -69,7 +71,7 @@ it. 
## Features -- Support for **59+ languages** +- Support for **60+ languages** - **Trained pipelines** - Multi-task learning with pretrained **transformers** like BERT - Pretrained **word vectors** diff --git a/requirements.txt b/requirements.txt index 4d6c1dfd0..a8b237aa1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -20,6 +20,7 @@ pytokenizations setuptools packaging importlib_metadata>=0.20; python_version < "3.8" +typing_extensions>=3.7.4; python_version < "3.8" # Development dependencies cython>=0.25 pytest>=4.6.5 diff --git a/setup.cfg b/setup.cfg index dd0975800..9831402d1 100644 --- a/setup.cfg +++ b/setup.cfg @@ -57,6 +57,7 @@ install_requires = setuptools packaging importlib_metadata>=0.20; python_version < "3.8" + typing_extensions>=3.7.4; python_version < "3.8" [options.entry_points] console_scripts = diff --git a/spacy/about.py b/spacy/about.py index ec3c168a5..8d019897b 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy-nightly" -__version__ = "3.0.0a20" +__version__ = "3.0.0a23" __release__ = True __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py index 797a701b9..21a4e54ce 100644 --- a/spacy/cli/_util.py +++ b/spacy/cli/_util.py @@ -378,7 +378,7 @@ def git_sparse_checkout(repo, subpath, dest, branch): # Looking for this 'rev-list' command in the git --help? Hah. cmd = f"git -C {tmp_dir} rev-list --objects --all --missing=print -- {subpath}" ret = run_command(cmd, capture=True) - git_repo = _from_http_to_git(repo) + git_repo = _http_to_git(repo) # Now pass those missings into another bit of git internals missings = " ".join([x[1:] for x in ret.stdout.split() if x.startswith("?")]) if not missings: @@ -414,7 +414,7 @@ def get_git_version( return (int(version[0]), int(version[1])) -def _from_http_to_git(repo: str) -> str: +def _http_to_git(repo: str) -> str: if repo.startswith("http://"): repo = repo.replace(r"http://", r"https://") if repo.startswith(r"https://"): diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py index ad89b9976..8f8234c61 100644 --- a/spacy/cli/convert.py +++ b/spacy/cli/convert.py @@ -9,7 +9,7 @@ import sys from ._util import app, Arg, Opt from ..training import docs_to_json from ..tokens import DocBin -from ..training.converters import iob2docs, conll_ner2docs, json2docs, conllu2docs +from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs, conllu_to_docs # Converters are matched by file extension except for ner/iob, which are @@ -18,12 +18,12 @@ from ..training.converters import iob2docs, conll_ner2docs, json2docs, conllu2do # imported from /converters. 
CONVERTERS = { - "conllubio": conllu2docs, - "conllu": conllu2docs, - "conll": conllu2docs, - "ner": conll_ner2docs, - "iob": iob2docs, - "json": json2docs, + "conllubio": conllu_to_docs, + "conllu": conllu_to_docs, + "conll": conllu_to_docs, + "ner": conll_ner_to_docs, + "iob": iob_to_docs, + "json": json_to_docs, } diff --git a/spacy/cli/debug_config.py b/spacy/cli/debug_config.py index 7930d0674..d07a0bb2d 100644 --- a/spacy/cli/debug_config.py +++ b/spacy/cli/debug_config.py @@ -2,7 +2,7 @@ from typing import Optional, Dict, Any, Union, List from pathlib import Path from wasabi import msg, table from thinc.api import Config -from thinc.config import VARIABLE_RE +from thinc.config import VARIABLE_RE, ConfigValidationError import typer from ._util import Arg, Opt, show_validation_error, parse_config_overrides @@ -51,7 +51,10 @@ def debug_config( msg.divider("Config validation") with show_validation_error(config_path): config = util.load_config(config_path, overrides=overrides) - nlp, _ = util.load_model_from_config(config) + nlp, resolved = util.load_model_from_config(config) + # Use the resolved config here in case user has one function returning + # a dict of corpora etc. + check_section_refs(resolved, ["training.dev_corpus", "training.train_corpus"]) msg.good("Config is valid") if show_vars: variables = get_variables(config) @@ -93,3 +96,23 @@ def get_variables(config: Config) -> Dict[str, Any]: value = util.dot_to_object(config, path) result[variable] = repr(value) return result + + +def check_section_refs(config: Config, fields: List[str]) -> None: + """Validate fields in the config that refer to other sections or values + (e.g. in the corpora) and make sure that those references exist. + """ + errors = [] + for field in fields: + # If the field doesn't exist in the config, we ignore it + try: + value = util.dot_to_object(config, field) + except KeyError: + continue + try: + util.dot_to_object(config, value) + except KeyError: + msg = f"not a valid section reference: {value}" + errors.append({"loc": field.split("."), "msg": msg}) + if errors: + raise ConfigValidationError(config, errors) diff --git a/spacy/cli/debug_model.py b/spacy/cli/debug_model.py index 1d27c7c52..7f8e1dabc 100644 --- a/spacy/cli/debug_model.py +++ b/spacy/cli/debug_model.py @@ -128,7 +128,7 @@ def debug_model( goldY = None for e in range(3): if tok2vec: - tok2vec.predict(X) + tok2vec.update([Example.from_dict(x, {}) for x in X]) Y, get_dX = model.begin_update(X) if goldY is None: goldY = _simulate_gold(Y) diff --git a/spacy/cli/init_config.py b/spacy/cli/init_config.py index e70195e15..5203c5dbb 100644 --- a/spacy/cli/init_config.py +++ b/spacy/cli/init_config.py @@ -36,7 +36,7 @@ def init_config_cli( """ Generate a starter config.cfg for training. Based on your requirements specified via the CLI arguments, this command generates a config with the - optimal settings for you use case. This includes the choice of architecture, + optimal settings for your use case. This includes the choice of architecture, pretrained weights and related hyperparameters. DOCS: https://nightly.spacy.io/api/cli#init-config diff --git a/spacy/cli/project/pull.py b/spacy/cli/project/pull.py index edcd410bd..3119d3a12 100644 --- a/spacy/cli/project/pull.py +++ b/spacy/cli/project/pull.py @@ -27,19 +27,32 @@ def project_pull_cli( def project_pull(project_dir: Path, remote: str, *, verbose: bool = False): + # TODO: We don't have tests for this :(. It would take a bit of mockery to + # set up. I guess see if it breaks first? 
config = load_project_config(project_dir) if remote in config.get("remotes", {}): remote = config["remotes"][remote] storage = RemoteStorage(project_dir, remote) - for cmd in config.get("commands", []): - deps = [project_dir / dep for dep in cmd.get("deps", [])] - if any(not dep.exists() for dep in deps): - continue - cmd_hash = get_command_hash("", "", deps, cmd["script"]) - for output_path in cmd.get("outputs", []): - url = storage.pull(output_path, command_hash=cmd_hash) - yield url, output_path + commands = list(config.get("commands", [])) + # We use a while loop here because we don't know how the commands + # will be ordered. A command might need dependencies from one that's later + # in the list. + while commands: + for i, cmd in enumerate(list(commands)): + deps = [project_dir / dep for dep in cmd.get("deps", [])] + if all(dep.exists() for dep in deps): + cmd_hash = get_command_hash("", "", deps, cmd["script"]) + for output_path in cmd.get("outputs", []): + url = storage.pull(output_path, command_hash=cmd_hash) + yield url, output_path - out_locs = [project_dir / out for out in cmd.get("outputs", [])] - if all(loc.exists() for loc in out_locs): - update_lockfile(project_dir, cmd) + out_locs = [project_dir / out for out in cmd.get("outputs", [])] + if all(loc.exists() for loc in out_locs): + update_lockfile(project_dir, cmd) + # We remove the command from the list here, and break, so that + # we iterate over the loop again. + commands.remove(i) + break + else: + # If we didn't break the for loop, break the while loop. + break diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja index 0db4c8a59..9a8b9d1d7 100644 --- a/spacy/cli/templates/quickstart_training.jinja +++ b/spacy/cli/templates/quickstart_training.jinja @@ -59,7 +59,8 @@ factory = "parser" [components.parser.model] @architectures = "spacy.TransitionBasedParser.v1" -nr_feature_tokens = 8 +state_type = "parser" +extra_state_tokens = false hidden_width = 128 maxout_pieces = 3 use_upper = false @@ -79,7 +80,8 @@ factory = "ner" [components.ner.model] @architectures = "spacy.TransitionBasedParser.v1" -nr_feature_tokens = 3 +state_type = "ner" +extra_state_tokens = false hidden_width = 64 maxout_pieces = 2 use_upper = false @@ -93,6 +95,49 @@ grad_factor = 1.0 @layers = "reduce_mean.v1" {% endif -%} +{% if "entity_linker" in components -%} +[components.entity_linker] +factory = "entity_linker" +get_candidates = {"@misc":"spacy.CandidateGenerator.v1"} +incl_context = true +incl_prior = true + +[components.entity_linker.model] +@architectures = "spacy.EntityLinker.v1" +nO = null + +[components.entity_linker.model.tok2vec] +@architectures = "spacy-transformers.TransformerListener.v1" +grad_factor = 1.0 + +[components.entity_linker.model.tok2vec.pooling] +@layers = "reduce_mean.v1" +{% endif -%} + +{% if "textcat" in components %} +[components.textcat] +factory = "textcat" + +{% if optimize == "accuracy" %} +[components.textcat.model] +@architectures = "spacy.TextCatEnsemble.v1" +exclusive_classes = false +width = 64 +conv_depth = 2 +embed_size = 2000 +window_size = 1 +ngram_size = 1 +nO = null + +{% else -%} +[components.textcat.model] +@architectures = "spacy.TextCatBOW.v1" +exclusive_classes = false +ngram_size = 1 +no_output_layer = false +{%- endif %} +{%- endif %} + {# NON-TRANSFORMER PIPELINE #} {% else -%} @@ -140,7 +185,8 @@ factory = "parser" [components.parser.model] @architectures = "spacy.TransitionBasedParser.v1" -nr_feature_tokens = 8 +state_type = "parser" 
+extra_state_tokens = false hidden_width = 128 maxout_pieces = 3 use_upper = true @@ -157,7 +203,8 @@ factory = "ner" [components.ner.model] @architectures = "spacy.TransitionBasedParser.v1" -nr_feature_tokens = 6 +state_type = "ner" +extra_state_tokens = false hidden_width = 64 maxout_pieces = 2 use_upper = true @@ -167,10 +214,50 @@ nO = null @architectures = "spacy.Tok2VecListener.v1" width = ${components.tok2vec.model.encode.width} {% endif %} + +{% if "entity_linker" in components -%} +[components.entity_linker] +factory = "entity_linker" +get_candidates = {"@misc":"spacy.CandidateGenerator.v1"} +incl_context = true +incl_prior = true + +[components.entity_linker.model] +@architectures = "spacy.EntityLinker.v1" +nO = null + +[components.entity_linker.model.tok2vec] +@architectures = "spacy.Tok2VecListener.v1" +width = ${components.tok2vec.model.encode.width} +{% endif %} + +{% if "textcat" in components %} +[components.textcat] +factory = "textcat" + +{% if optimize == "accuracy" %} +[components.textcat.model] +@architectures = "spacy.TextCatEnsemble.v1" +exclusive_classes = false +width = 64 +conv_depth = 2 +embed_size = 2000 +window_size = 1 +ngram_size = 1 +nO = null + +{% else -%} +[components.textcat.model] +@architectures = "spacy.TextCatBOW.v1" +exclusive_classes = false +ngram_size = 1 +no_output_layer = false +{%- endif %} +{%- endif %} {% endif %} {% for pipe in components %} -{% if pipe not in ["tagger", "parser", "ner"] %} +{% if pipe not in ["tagger", "parser", "ner", "textcat", "entity_linker"] %} {# Other components defined by the user: we just assume they're factories #} [components.{{ pipe }}] factory = "{{ pipe }}" @@ -197,7 +284,7 @@ vectors = "{{ word_vectors }}" {% endif -%} {% if use_transformer -%} accumulate_gradient = {{ transformer["size_factor"] }} -{% endif %} +{% endif -%} dev_corpus = "corpora.dev" train_corpus = "corpora.train" @@ -230,18 +317,3 @@ start = 100 stop = 1000 compound = 1.001 {% endif %} - -[training.score_weights] -{%- if "tagger" in components %} -tag_acc = {{ (1.0 / components|length)|round(2) }} -{%- endif -%} -{%- if "parser" in components %} -dep_uas = 0.0 -dep_las = {{ (1.0 / components|length)|round(2) }} -sents_f = 0.0 -{%- endif %} -{%- if "ner" in components %} -ents_f = {{ (1.0 / components|length)|round(2) }} -ents_p = 0.0 -ents_r = 0.0 -{%- endif -%} diff --git a/spacy/cli/train.py b/spacy/cli/train.py index bf3749c9e..eabc82be0 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -152,7 +152,8 @@ def train( exclude=frozen_components, ) msg.info(f"Training. 
Initial learn rate: {optimizer.learn_rate}") - print_row, finalize_logger = train_logger(nlp) + with nlp.select_pipes(disable=frozen_components): + print_row, finalize_logger = train_logger(nlp) try: progress = tqdm.tqdm(total=T_cfg["eval_frequency"], leave=False) @@ -163,7 +164,8 @@ def train( progress.close() print_row(info) if is_best_checkpoint and output_path is not None: - update_meta(T_cfg, nlp, info) + with nlp.select_pipes(disable=frozen_components): + update_meta(T_cfg, nlp, info) with nlp.use_params(optimizer.averages): nlp.to_disk(output_path / "model-best") progress = tqdm.tqdm(total=T_cfg["eval_frequency"], leave=False) @@ -207,10 +209,17 @@ def create_train_batches(iterator, batcher, max_epochs: int): def create_evaluation_callback( nlp: Language, dev_corpus: Callable, weights: Dict[str, float] ) -> Callable[[], Tuple[float, Dict[str, float]]]: + weights = {key: value for key, value in weights.items() if value is not None} + def evaluate() -> Tuple[float, Dict[str, float]]: dev_examples = list(dev_corpus(nlp)) scores = nlp.evaluate(dev_examples) - # Calculate a weighted sum based on score_weights for the main score + # Calculate a weighted sum based on score_weights for the main score. + # We can only consider scores that are ints/floats, not dicts like + # entity scores per type etc. + for key, value in scores.items(): + if key in weights and not isinstance(value, (int, float)): + raise ValueError(Errors.E915.format(name=key, score_type=type(value))) try: weighted_score = sum( scores.get(s, 0.0) * weights.get(s, 0.0) for s in weights @@ -366,7 +375,8 @@ def update_meta( ) -> None: nlp.meta["performance"] = {} for metric in training["score_weights"]: - nlp.meta["performance"][metric] = info["other_scores"].get(metric, 0.0) + if metric is not None: + nlp.meta["performance"][metric] = info["other_scores"].get(metric, 0.0) for pipe_name in nlp.pipe_names: nlp.meta["performance"][f"{pipe_name}_loss"] = info["losses"][pipe_name] diff --git a/spacy/compat.py b/spacy/compat.py index 2d51ff0ae..6eca18b80 100644 --- a/spacy/compat.py +++ b/spacy/compat.py @@ -22,6 +22,11 @@ try: except ImportError: cupy = None +try: # Python 3.8+ + from typing import Literal +except ImportError: + from typing_extensions import Literal # noqa: F401 + from thinc.api import Optimizer # noqa: F401 pickle = pickle diff --git a/spacy/errors.py b/spacy/errors.py index f276c4d1a..dce5cf51c 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -69,7 +69,7 @@ class Warnings: "in problems with the vocab further on in the pipeline.") W030 = ("Some entities could not be aligned in the text \"{text}\" with " "entities \"{entities}\". Use " - "`spacy.training.biluo_tags_from_offsets(nlp.make_doc(text), entities)`" + "`spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)`" " to check the alignment. Misaligned entities ('-') will be " "ignored during training.") W033 = ("Training a new {model} using a model with no lexeme normalization " @@ -480,6 +480,13 @@ class Errors: E201 = ("Span index out of range.") # TODO: fix numbering after merging develop into master + E915 = ("Can't use score '{name}' to calculate final weighted score. Expected " + "float or int but got: {score_type}. 
To exclude the score from the " + "final score, set its weight to null in the [training.score_weights] " + "section of your training config.") + E916 = ("Can't log score for '{name}' in table: not a valid score ({score_type})") + E917 = ("Received invalid value {value} for 'state_type' in " + "TransitionBasedParser: only 'parser' or 'ner' are valid options.") E918 = ("Received invalid value for vocab: {vocab} ({vocab_type}). Valid " "values are an instance of spacy.vocab.Vocab or True to create one" " (default).") diff --git a/spacy/kb.pxd b/spacy/kb.pxd index 695693666..4a71b26a2 100644 --- a/spacy/kb.pxd +++ b/spacy/kb.pxd @@ -140,7 +140,6 @@ cdef class KnowledgeBase: self._entries.push_back(entry) self._aliases_table.push_back(alias) - cpdef from_disk(self, loc) cpdef set_entities(self, entity_list, freq_list, vector_list) diff --git a/spacy/kb.pyx b/spacy/kb.pyx index b24ed3a20..ff5382c24 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -9,7 +9,8 @@ from libcpp.vector cimport vector from pathlib import Path import warnings -from os import path + +from spacy import util from .typedefs cimport hash_t from .errors import Errors, Warnings @@ -319,8 +320,14 @@ cdef class KnowledgeBase: return 0.0 - def to_disk(self, loc): - cdef Writer writer = Writer(loc) + def to_disk(self, path): + path = util.ensure_path(path) + if path.is_dir(): + raise ValueError(Errors.E928.format(loc=path)) + if not path.parent.exists(): + path.parent.mkdir(parents=True) + + cdef Writer writer = Writer(path) writer.write_header(self.get_size_entities(), self.entity_vector_length) # dumping the entity vectors in their original order @@ -359,7 +366,13 @@ cdef class KnowledgeBase: writer.close() - cpdef from_disk(self, loc): + def from_disk(self, path): + path = util.ensure_path(path) + if path.is_dir(): + raise ValueError(Errors.E928.format(loc=path)) + if not path.exists(): + raise ValueError(Errors.E929.format(loc=path)) + cdef hash_t entity_hash cdef hash_t alias_hash cdef int64_t entry_index @@ -369,7 +382,7 @@ cdef class KnowledgeBase: cdef AliasC alias cdef float vector_element - cdef Reader reader = Reader(loc) + cdef Reader reader = Reader(path) # STEP 0: load header and initialize KB cdef int64_t nr_entities @@ -450,16 +463,13 @@ cdef class KnowledgeBase: cdef class Writer: - def __init__(self, object loc): - if isinstance(loc, Path): - loc = bytes(loc) - if path.exists(loc): - if path.isdir(loc): - raise ValueError(Errors.E928.format(loc=loc)) - cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc + def __init__(self, path): + assert isinstance(path, Path) + content = bytes(path) + cdef bytes bytes_loc = content.encode('utf8') if type(content) == unicode else content self._fp = fopen(bytes_loc, 'wb') if not self._fp: - raise IOError(Errors.E146.format(path=loc)) + raise IOError(Errors.E146.format(path=path)) fseek(self._fp, 0, 0) def close(self): @@ -496,14 +506,9 @@ cdef class Writer: cdef class Reader: - def __init__(self, object loc): - if isinstance(loc, Path): - loc = bytes(loc) - if not path.exists(loc): - raise ValueError(Errors.E929.format(loc=loc)) - if path.isdir(loc): - raise ValueError(Errors.E928.format(loc=loc)) - cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc + def __init__(self, path): + content = bytes(path) + cdef bytes bytes_loc = content.encode('utf8') if type(content) == unicode else content self._fp = fopen(bytes_loc, 'rb') if not self._fp: PyErr_SetFromErrno(IOError) diff --git a/spacy/lang/bn/__init__.py b/spacy/lang/bn/__init__.py 
index 270185a4b..923e29a17 100644 --- a/spacy/lang/bn/__init__.py +++ b/spacy/lang/bn/__init__.py @@ -25,7 +25,6 @@ class Bengali(Language): "lemmatizer", assigns=["token.lemma"], default_config={"model": None, "mode": "rule", "lookups": None}, - scores=["lemma_acc"], default_score_weights={"lemma_acc": 1.0}, ) def make_lemmatizer( diff --git a/spacy/lang/cs/test_text.py b/spacy/lang/cs/test_text.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/spacy/lang/el/__init__.py b/spacy/lang/el/__init__.py index 0c5e0672b..1a7b19914 100644 --- a/spacy/lang/el/__init__.py +++ b/spacy/lang/el/__init__.py @@ -30,7 +30,6 @@ class Greek(Language): "lemmatizer", assigns=["token.lemma"], default_config={"model": None, "mode": "rule", "lookups": None}, - scores=["lemma_acc"], default_score_weights={"lemma_acc": 1.0}, ) def make_lemmatizer( diff --git a/spacy/lang/en/__init__.py b/spacy/lang/en/__init__.py index 1a595b6e7..bf7e9987f 100644 --- a/spacy/lang/en/__init__.py +++ b/spacy/lang/en/__init__.py @@ -29,7 +29,6 @@ class English(Language): "lemmatizer", assigns=["token.lemma"], default_config={"model": None, "mode": "rule", "lookups": None}, - scores=["lemma_acc"], default_score_weights={"lemma_acc": 1.0}, ) def make_lemmatizer( diff --git a/spacy/lang/fa/__init__.py b/spacy/lang/fa/__init__.py index 244534120..f3a6635dc 100644 --- a/spacy/lang/fa/__init__.py +++ b/spacy/lang/fa/__init__.py @@ -28,7 +28,6 @@ class Persian(Language): "lemmatizer", assigns=["token.lemma"], default_config={"model": None, "mode": "rule", "lookups": None}, - scores=["lemma_acc"], default_score_weights={"lemma_acc": 1.0}, ) def make_lemmatizer( diff --git a/spacy/lang/fr/__init__.py b/spacy/lang/fr/__init__.py index 42241cd8a..72e641d1f 100644 --- a/spacy/lang/fr/__init__.py +++ b/spacy/lang/fr/__init__.py @@ -33,7 +33,6 @@ class French(Language): "lemmatizer", assigns=["token.lemma"], default_config={"model": None, "mode": "rule", "lookups": None}, - scores=["lemma_acc"], default_score_weights={"lemma_acc": 1.0}, ) def make_lemmatizer( diff --git a/spacy/lang/nb/__init__.py b/spacy/lang/nb/__init__.py index 28a2f0bf2..9672dfd6e 100644 --- a/spacy/lang/nb/__init__.py +++ b/spacy/lang/nb/__init__.py @@ -28,7 +28,6 @@ class Norwegian(Language): "lemmatizer", assigns=["token.lemma"], default_config={"model": None, "mode": "rule", "lookups": None}, - scores=["lemma_acc"], default_score_weights={"lemma_acc": 1.0}, ) def make_lemmatizer( diff --git a/spacy/lang/nl/__init__.py b/spacy/lang/nl/__init__.py index 1526e41f5..15b6b9de2 100644 --- a/spacy/lang/nl/__init__.py +++ b/spacy/lang/nl/__init__.py @@ -30,7 +30,6 @@ class Dutch(Language): "lemmatizer", assigns=["token.lemma"], default_config={"model": None, "mode": "rule", "lookups": None}, - scores=["lemma_acc"], default_score_weights={"lemma_acc": 1.0}, ) def make_lemmatizer( diff --git a/spacy/lang/pl/__init__.py b/spacy/lang/pl/__init__.py index 7ddad9893..573dbc6f9 100644 --- a/spacy/lang/pl/__init__.py +++ b/spacy/lang/pl/__init__.py @@ -35,7 +35,6 @@ class Polish(Language): "lemmatizer", assigns=["token.lemma"], default_config={"model": None, "mode": "pos_lookup", "lookups": None}, - scores=["lemma_acc"], default_score_weights={"lemma_acc": 1.0}, ) def make_lemmatizer( diff --git a/spacy/lang/ru/__init__.py b/spacy/lang/ru/__init__.py index be770e3ec..4a296dd23 100644 --- a/spacy/lang/ru/__init__.py +++ b/spacy/lang/ru/__init__.py @@ -25,7 +25,6 @@ class Russian(Language): "lemmatizer", assigns=["token.lemma"], default_config={"model": None, "mode": 
"pymorphy2", "lookups": None}, - scores=["lemma_acc"], default_score_weights={"lemma_acc": 1.0}, ) def make_lemmatizer( diff --git a/spacy/lang/sv/__init__.py b/spacy/lang/sv/__init__.py index 6db74cd39..ea314f487 100644 --- a/spacy/lang/sv/__init__.py +++ b/spacy/lang/sv/__init__.py @@ -31,7 +31,6 @@ class Swedish(Language): "lemmatizer", assigns=["token.lemma"], default_config={"model": None, "mode": "rule", "lookups": None}, - scores=["lemma_acc"], default_score_weights={"lemma_acc": 1.0}, ) def make_lemmatizer( diff --git a/spacy/lang/uk/__init__.py b/spacy/lang/uk/__init__.py index e9936cf7d..006a1cf7f 100644 --- a/spacy/lang/uk/__init__.py +++ b/spacy/lang/uk/__init__.py @@ -25,7 +25,6 @@ class Ukrainian(Language): "lemmatizer", assigns=["token.lemma"], default_config={"model": None, "mode": "pymorphy2", "lookups": None}, - scores=["lemma_acc"], default_score_weights={"lemma_acc": 1.0}, ) def make_lemmatizer( diff --git a/spacy/language.py b/spacy/language.py index 4dffd9679..a52391419 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -248,9 +248,12 @@ class Language: self._config["nlp"]["pipeline"] = list(self.component_names) self._config["nlp"]["disabled"] = list(self.disabled) self._config["components"] = pipeline - if not self._config["training"].get("score_weights"): - combined_score_weights = combine_score_weights(score_weights) - self._config["training"]["score_weights"] = combined_score_weights + # We're merging the existing score weights back into the combined + # weights to make sure we're preserving custom settings in the config + # but also reflect updates (e.g. new components added) + prev_weights = self._config["training"].get("score_weights", {}) + combined_score_weights = combine_score_weights(score_weights, prev_weights) + self._config["training"]["score_weights"] = combined_score_weights if not srsly.is_json_serializable(self._config): raise ValueError(Errors.E961.format(config=self._config)) return self._config @@ -412,7 +415,6 @@ class Language: assigns: Iterable[str] = SimpleFrozenList(), requires: Iterable[str] = SimpleFrozenList(), retokenizes: bool = False, - scores: Iterable[str] = SimpleFrozenList(), default_score_weights: Dict[str, float] = SimpleFrozenDict(), func: Optional[Callable] = None, ) -> Callable: @@ -430,12 +432,11 @@ class Language: e.g. "token.ent_id". Used for pipeline analyis. retokenizes (bool): Whether the component changes the tokenization. Used for pipeline analysis. - scores (Iterable[str]): All scores set by the component if it's trainable, - e.g. ["ents_f", "ents_r", "ents_p"]. default_score_weights (Dict[str, float]): The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to 1.0 per component and - will be combined and normalized for the whole pipeline. + will be combined and normalized for the whole pipeline. If None, + the score won't be shown in the logs or be weighted. func (Optional[Callable]): Factory function if not used as a decorator. 
DOCS: https://nightly.spacy.io/api/language#factory @@ -475,7 +476,7 @@ class Language: default_config=default_config, assigns=validate_attrs(assigns), requires=validate_attrs(requires), - scores=scores, + scores=list(default_score_weights.keys()), default_score_weights=default_score_weights, retokenizes=retokenizes, ) diff --git a/spacy/ml/models/parser.py b/spacy/ml/models/parser.py index 868f9d6d2..2c40bb3ab 100644 --- a/spacy/ml/models/parser.py +++ b/spacy/ml/models/parser.py @@ -2,6 +2,8 @@ from typing import Optional, List from thinc.api import Model, chain, list2array, Linear, zero_init, use_ops from thinc.types import Floats2d +from ...errors import Errors +from ...compat import Literal from ...util import registry from .._precomputable_affine import PrecomputableAffine from ..tb_framework import TransitionModel @@ -11,7 +13,8 @@ from ...tokens import Doc @registry.architectures.register("spacy.TransitionBasedParser.v1") def build_tb_parser_model( tok2vec: Model[List[Doc], List[Floats2d]], - nr_feature_tokens: int, + state_type: Literal["parser", "ner"], + extra_state_tokens: bool, hidden_width: int, maxout_pieces: int, use_upper: bool = True, @@ -40,20 +43,12 @@ def build_tb_parser_model( tok2vec (Model[List[Doc], List[Floats2d]]): Subnetwork to map tokens into vector representations. - nr_feature_tokens (int): The number of tokens in the context to use to - construct the state vector. Valid choices are 1, 2, 3, 6, 8 and 13. The - 2, 8 and 13 feature sets are designed for the parser, while the 3 and 6 - feature sets are designed for the NER. The recommended feature sets are - 3 for NER, and 8 for the dependency parser. - - TODO: This feature should be split into two, state_type: ["deps", "ner"] - and extra_state_features: [True, False]. This would map into: - - (deps, False): 8 - (deps, True): 13 - (ner, False): 3 - (ner, True): 6 - + state_type (str): + String value denoting the type of parser model: "parser" or "ner" + extra_state_tokens (bool): Whether or not to use additional tokens in the context + to construct the state vector. Defaults to `False`, which means 3 and 8 + for the NER and parser respectively. When set to `True`, this would become 6 + feature sets (for the NER) or 13 (for the parser). hidden_width (int): The width of the hidden layer. maxout_pieces (int): How many pieces to use in the state prediction layer. Recommended values are 1, 2 or 3. If 1, the maxout non-linearity @@ -68,8 +63,14 @@ def build_tb_parser_model( Usually inferred from data at the beginning of training, or loaded from disk. 
""" + if state_type == "parser": + nr_feature_tokens = 13 if extra_state_tokens else 8 + elif state_type == "ner": + nr_feature_tokens = 6 if extra_state_tokens else 3 + else: + raise ValueError(Errors.E917.format(value=state_type)) t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None - tok2vec = chain(tok2vec, list2array(), Linear(hidden_width, t2v_width),) + tok2vec = chain(tok2vec, list2array(), Linear(hidden_width, t2v_width)) tok2vec.set_dim("nO", hidden_width) lower = PrecomputableAffine( nO=hidden_width if use_upper else nO, diff --git a/spacy/pipeline/dep_parser.pyx b/spacy/pipeline/dep_parser.pyx index edd791e40..a447434d2 100644 --- a/spacy/pipeline/dep_parser.pyx +++ b/spacy/pipeline/dep_parser.pyx @@ -15,7 +15,8 @@ from ..training import validate_examples default_model_config = """ [model] @architectures = "spacy.TransitionBasedParser.v1" -nr_feature_tokens = 8 +state_type = "parser" +extra_state_tokens = false hidden_width = 64 maxout_pieces = 2 @@ -42,8 +43,14 @@ DEFAULT_PARSER_MODEL = Config().from_str(default_model_config)["model"] "min_action_freq": 30, "model": DEFAULT_PARSER_MODEL, }, - scores=["dep_uas", "dep_las", "dep_las_per_type", "sents_p", "sents_r", "sents_f"], - default_score_weights={"dep_uas": 0.5, "dep_las": 0.5, "sents_f": 0.0}, + default_score_weights={ + "dep_uas": 0.5, + "dep_las": 0.5, + "dep_las_per_type": None, + "sents_p": None, + "sents_r": None, + "sents_f": 0.0, + }, ) def make_parser( nlp: Language, diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index 24bbb067f..9166a69b8 100644 --- a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -25,8 +25,12 @@ PatternType = Dict[str, Union[str, List[Dict[str, Any]]]] "overwrite_ents": False, "ent_id_sep": DEFAULT_ENT_ID_SEP, }, - scores=["ents_p", "ents_r", "ents_f", "ents_per_type"], - default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0}, + default_score_weights={ + "ents_f": 1.0, + "ents_p": 0.0, + "ents_r": 0.0, + "ents_per_type": None, + }, ) def make_entity_ruler( nlp: Language, diff --git a/spacy/pipeline/lemmatizer.py b/spacy/pipeline/lemmatizer.py index 0fd3482c4..c30d09f62 100644 --- a/spacy/pipeline/lemmatizer.py +++ b/spacy/pipeline/lemmatizer.py @@ -21,7 +21,6 @@ from .. 
import util "lookups": None, "overwrite": False, }, - scores=["lemma_acc"], default_score_weights={"lemma_acc": 1.0}, ) def make_lemmatizer( diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index 62ad9e0eb..5fee9a900 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -49,8 +49,7 @@ DEFAULT_MORPH_MODEL = Config().from_str(default_model_config)["model"] "morphologizer", assigns=["token.morph", "token.pos"], default_config={"model": DEFAULT_MORPH_MODEL}, - scores=["pos_acc", "morph_acc", "morph_per_feat"], - default_score_weights={"pos_acc": 0.5, "morph_acc": 0.5}, + default_score_weights={"pos_acc": 0.5, "morph_acc": 0.5, "morph_per_feat": None}, ) def make_morphologizer( nlp: Language, diff --git a/spacy/pipeline/ner.pyx b/spacy/pipeline/ner.pyx index 2fa5c6392..c9b0a5031 100644 --- a/spacy/pipeline/ner.pyx +++ b/spacy/pipeline/ner.pyx @@ -13,7 +13,8 @@ from ..training import validate_examples default_model_config = """ [model] @architectures = "spacy.TransitionBasedParser.v1" -nr_feature_tokens = 6 +state_type = "ner" +extra_state_tokens = false hidden_width = 64 maxout_pieces = 2 @@ -38,8 +39,7 @@ DEFAULT_NER_MODEL = Config().from_str(default_model_config)["model"] "update_with_oracle_cut_size": 100, "model": DEFAULT_NER_MODEL, }, - scores=["ents_p", "ents_r", "ents_f", "ents_per_type"], - default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0}, + default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None}, ) def make_ner( diff --git a/spacy/pipeline/sentencizer.pyx b/spacy/pipeline/sentencizer.pyx index 5700c2b98..2882f6f8b 100644 --- a/spacy/pipeline/sentencizer.pyx +++ b/spacy/pipeline/sentencizer.pyx @@ -15,7 +15,6 @@ from .. import util "sentencizer", assigns=["token.is_sent_start", "doc.sents"], default_config={"punct_chars": None}, - scores=["sents_p", "sents_r", "sents_f"], default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0}, ) def make_sentencizer( diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx index a7eb721fd..da85a9cf2 100644 --- a/spacy/pipeline/senter.pyx +++ b/spacy/pipeline/senter.pyx @@ -36,7 +36,6 @@ DEFAULT_SENTER_MODEL = Config().from_str(default_model_config)["model"] "senter", assigns=["token.is_sent_start"], default_config={"model": DEFAULT_SENTER_MODEL}, - scores=["sents_p", "sents_r", "sents_f"], default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0}, ) def make_senter(nlp: Language, name: str, model: Model): diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx index 0d78047ae..3efe29916 100644 --- a/spacy/pipeline/tagger.pyx +++ b/spacy/pipeline/tagger.pyx @@ -42,7 +42,6 @@ DEFAULT_TAGGER_MODEL = Config().from_str(default_model_config)["model"] "tagger", assigns=["token.tag"], default_config={"model": DEFAULT_TAGGER_MODEL}, - scores=["tag_acc"], default_score_weights={"tag_acc": 1.0}, ) def make_tagger(nlp: Language, name: str, model: Model): diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py index e7cb62a0d..6b8c0ca65 100644 --- a/spacy/pipeline/textcat.py +++ b/spacy/pipeline/textcat.py @@ -62,18 +62,17 @@ subword_features = true "positive_label": None, "model": DEFAULT_TEXTCAT_MODEL, }, - scores=[ - "cats_score", - "cats_score_desc", - "cats_p", - "cats_r", - "cats_f", - "cats_macro_f", - "cats_macro_auc", - "cats_f_per_type", - "cats_macro_auc_per_type", - ], - default_score_weights={"cats_score": 1.0}, + default_score_weights={ + "cats_score": 1.0, + 
"cats_score_desc": None, + "cats_p": None, + "cats_r": None, + "cats_f": None, + "cats_macro_f": None, + "cats_macro_auc": None, + "cats_f_per_type": None, + "cats_macro_auc_per_type": None, + }, ) def make_textcat( nlp: Language, diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py index 721c67a19..9ab4e42b7 100644 --- a/spacy/pipeline/tok2vec.py +++ b/spacy/pipeline/tok2vec.py @@ -127,7 +127,7 @@ class Tok2Vec(Pipe): tokvecs = self.model.predict(docs) batch_id = Tok2VecListener.get_batch_id(docs) for listener in self.listeners: - listener.receive(batch_id, tokvecs, None) + listener.receive(batch_id, tokvecs, lambda dX: []) return tokvecs def set_annotations(self, docs: Sequence[Doc], tokvecses) -> None: diff --git a/spacy/schemas.py b/spacy/schemas.py index b0f26dcd7..e34841008 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -211,7 +211,7 @@ class ConfigSchemaTraining(BaseModel): seed: Optional[StrictInt] = Field(..., title="Random seed") gpu_allocator: Optional[StrictStr] = Field(..., title="Memory allocator when running on GPU") accumulate_gradient: StrictInt = Field(..., title="Whether to divide the batch up into substeps") - score_weights: Dict[StrictStr, Union[StrictFloat, StrictInt]] = Field(..., title="Scores to report and their weights for selecting final model") + score_weights: Dict[StrictStr, Optional[Union[StrictFloat, StrictInt]]] = Field(..., title="Scores to report and their weights for selecting final model") init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights") raw_text: Optional[StrictStr] = Field(default=None, title="Raw text") optimizer: Optimizer = Field(..., title="The optimizer to use") diff --git a/spacy/scorer.py b/spacy/scorer.py index da22d59d4..c50de3d43 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -240,7 +240,7 @@ class Scorer: pred_per_feat[field].add((gold_i, feat)) for field in per_feat: per_feat[field].score_set( - pred_per_feat.get(field, set()), gold_per_feat.get(field, set()), + pred_per_feat.get(field, set()), gold_per_feat.get(field, set()) ) result = {k: v.to_dict() for k, v in per_feat.items()} return {f"{attr}_per_feat": result} @@ -418,9 +418,9 @@ class Scorer: f_per_type[pred_label].fp += 1 micro_prf = PRFScore() for label_prf in f_per_type.values(): - micro_prf.tp = label_prf.tp - micro_prf.fn = label_prf.fn - micro_prf.fp = label_prf.fp + micro_prf.tp += label_prf.tp + micro_prf.fn += label_prf.fn + micro_prf.fp += label_prf.fp n_cats = len(f_per_type) + 1e-100 macro_p = sum(prf.precision for prf in f_per_type.values()) / n_cats macro_r = sum(prf.recall for prf in f_per_type.values()) / n_cats diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index c43d2c58e..88e0646b3 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -144,6 +144,29 @@ def test_kb_empty(nlp): entity_linker.begin_training(lambda: []) +def test_kb_serialize(nlp): + """Test serialization of the KB""" + mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1) + with make_tempdir() as d: + # normal read-write behaviour + mykb.to_disk(d / "kb") + mykb.from_disk(d / "kb") + mykb.to_disk(d / "kb.file") + mykb.from_disk(d / "kb.file") + mykb.to_disk(d / "new" / "kb") + mykb.from_disk(d / "new" / "kb") + # allow overwriting an existing file + mykb.to_disk(d / "kb.file") + with pytest.raises(ValueError): + # can not write to a directory + mykb.to_disk(d) + with pytest.raises(ValueError): + # can not read from a 
directory + mykb.from_disk(d) + with pytest.raises(ValueError): + # can not read from an unknown file + mykb.from_disk(d / "unknown" / "kb") + def test_candidate_generation(nlp): """Test correct candidate generation""" mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1) diff --git a/spacy/tests/pipeline/test_pipe_factories.py b/spacy/tests/pipeline/test_pipe_factories.py index 881460704..4c197005e 100644 --- a/spacy/tests/pipeline/test_pipe_factories.py +++ b/spacy/tests/pipeline/test_pipe_factories.py @@ -359,12 +359,8 @@ def test_language_factories_scores(): func = lambda nlp, name: lambda doc: doc weights1 = {"a1": 0.5, "a2": 0.5} weights2 = {"b1": 0.2, "b2": 0.7, "b3": 0.1} - Language.factory( - f"{name}1", scores=list(weights1), default_score_weights=weights1, func=func, - ) - Language.factory( - f"{name}2", scores=list(weights2), default_score_weights=weights2, func=func, - ) + Language.factory(f"{name}1", default_score_weights=weights1, func=func) + Language.factory(f"{name}2", default_score_weights=weights2, func=func) meta1 = Language.get_factory_meta(f"{name}1") assert meta1.default_score_weights == weights1 meta2 = Language.get_factory_meta(f"{name}2") @@ -376,6 +372,21 @@ def test_language_factories_scores(): cfg = nlp.config["training"] expected_weights = {"a1": 0.25, "a2": 0.25, "b1": 0.1, "b2": 0.35, "b3": 0.05} assert cfg["score_weights"] == expected_weights + # Test with custom defaults + config = nlp.config.copy() + config["training"]["score_weights"]["a1"] = 0.0 + config["training"]["score_weights"]["b3"] = 1.0 + nlp = English.from_config(config) + score_weights = nlp.config["training"]["score_weights"] + expected = {"a1": 0.0, "a2": 0.5, "b1": 0.03, "b2": 0.12, "b3": 0.34} + assert score_weights == expected + # Test with null values + config = nlp.config.copy() + config["training"]["score_weights"]["a1"] = None + nlp = English.from_config(config) + score_weights = nlp.config["training"]["score_weights"] + expected = {"a1": None, "a2": 0.5, "b1": 0.03, "b2": 0.12, "b3": 0.35} + assert score_weights == expected def test_pipe_factories_from_source(): diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py index 99b5132ca..232b53e1d 100644 --- a/spacy/tests/pipeline/test_textcat.py +++ b/spacy/tests/pipeline/test_textcat.py @@ -8,6 +8,7 @@ from spacy.language import Language from spacy.pipeline import TextCategorizer from spacy.tokens import Doc from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL +from spacy.scorer import Scorer from ..util import make_tempdir from ...cli.train import verify_textcat_config @@ -224,3 +225,31 @@ def test_positive_class_not_binary(): assert textcat.labels == ("SOME", "THING", "POS") with pytest.raises(ValueError): verify_textcat_config(nlp, pipe_config) + +def test_textcat_evaluation(): + train_examples = [] + nlp = English() + ref1 = nlp("one") + ref1.cats = {"winter": 1.0, "summer": 1.0, "spring": 1.0, "autumn": 1.0} + pred1 = nlp("one") + pred1.cats = {"winter": 1.0, "summer": 0.0, "spring": 1.0, "autumn": 1.0} + train_examples.append(Example(pred1, ref1)) + + ref2 = nlp("two") + ref2.cats = {"winter": 0.0, "summer": 0.0, "spring": 1.0, "autumn": 1.0} + pred2 = nlp("two") + pred2.cats = {"winter": 1.0, "summer": 0.0, "spring": 0.0, "autumn": 1.0} + train_examples.append(Example(pred2, ref2)) + + scores = Scorer().score_cats(train_examples, "cats", labels=["winter", "summer", "spring", "autumn"]) + assert scores["cats_f_per_type"]["winter"]["p"] == 1/2 + assert 
scores["cats_f_per_type"]["winter"]["r"] == 1/1 + assert scores["cats_f_per_type"]["summer"]["p"] == 0 + assert scores["cats_f_per_type"]["summer"]["r"] == 0/1 + assert scores["cats_f_per_type"]["spring"]["p"] == 1/1 + assert scores["cats_f_per_type"]["spring"]["r"] == 1/2 + assert scores["cats_f_per_type"]["autumn"]["p"] == 2/2 + assert scores["cats_f_per_type"]["autumn"]["r"] == 2/2 + + assert scores["cats_micro_p"] == 4/5 + assert scores["cats_micro_r"] == 4/6 diff --git a/spacy/tests/pipeline/test_tok2vec.py b/spacy/tests/pipeline/test_tok2vec.py index 2e514f490..985314217 100644 --- a/spacy/tests/pipeline/test_tok2vec.py +++ b/spacy/tests/pipeline/test_tok2vec.py @@ -169,3 +169,22 @@ def test_tok2vec_listener(): nlp.select_pipes(disable="tok2vec") assert nlp.pipe_names == ["tagger"] nlp("Running the pipeline with the Tok2Vec component disabled.") + + +def test_tok2vec_listener_callback(): + orig_config = Config().from_str(cfg_string) + nlp, config = util.load_model_from_config(orig_config, auto_fill=True, validate=True) + assert nlp.pipe_names == ["tok2vec", "tagger"] + tagger = nlp.get_pipe("tagger") + tok2vec = nlp.get_pipe("tok2vec") + nlp._link_components() + docs = [nlp.make_doc("A random sentence")] + tok2vec.model.initialize(X=docs) + gold_array = [[1.0 for tag in ["V", "Z"]] for word in docs] + label_sample = [tagger.model.ops.asarray(gold_array, dtype="float32")] + tagger.model.initialize(X=docs, Y=label_sample) + docs = [nlp.make_doc("Another entirely random sentence")] + tok2vec.update([Example.from_dict(x, {}) for x in docs]) + Y, get_dX = tagger.model.begin_update(docs) + # assure that the backprop call works (and doesn't hit a 'None' callback) + assert get_dX(Y) is not None diff --git a/spacy/tests/regression/test_issue4001-4500.py b/spacy/tests/regression/test_issue4001-4500.py index 4e58c347e..7b7ddfe0d 100644 --- a/spacy/tests/regression/test_issue4001-4500.py +++ b/spacy/tests/regression/test_issue4001-4500.py @@ -3,7 +3,7 @@ from spacy.pipeline import Pipe from spacy.matcher import PhraseMatcher, Matcher from spacy.tokens import Doc, Span, DocBin from spacy.training import Example, Corpus -from spacy.training.converters import json2docs +from spacy.training.converters import json_to_docs from spacy.vocab import Vocab from spacy.lang.en import English from spacy.util import minibatch, ensure_path, load_model @@ -425,7 +425,7 @@ def test_issue4402(): attrs = ["ORTH", "SENT_START", "ENT_IOB", "ENT_TYPE"] with make_tempdir() as tmpdir: output_file = tmpdir / "test4402.spacy" - docs = json2docs([json_data]) + docs = json_to_docs([json_data]) data = DocBin(docs=docs, attrs=attrs).to_bytes() with output_file.open("wb") as file_: file_.write(data) diff --git a/spacy/tests/regression/test_issue4501-5000.py b/spacy/tests/regression/test_issue4501-5000.py index 9454d7f0c..e351858f5 100644 --- a/spacy/tests/regression/test_issue4501-5000.py +++ b/spacy/tests/regression/test_issue4501-5000.py @@ -1,7 +1,7 @@ import pytest from spacy.tokens import Doc, Span, DocBin from spacy.training import Example -from spacy.training.converters.conllu2docs import conllu2docs +from spacy.training.converters.conllu_to_docs import conllu_to_docs from spacy.lang.en import English from spacy.kb import KnowledgeBase from spacy.vocab import Vocab @@ -82,7 +82,7 @@ def test_issue4651_without_phrase_matcher_attr(): def test_issue4665(): """ - conllu2json should not raise an exception if the HEAD column contains an + conllu_to_docs should not raise an exception if the HEAD column contains an underscore 
""" input_data = """ @@ -105,7 +105,7 @@ def test_issue4665(): 17 . _ PUNCT . _ _ punct _ _ 18 ] _ PUNCT -RRB- _ _ punct _ _ """ - conllu2docs(input_data) + conllu_to_docs(input_data) def test_issue4674(): diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py index 1e17b3212..ec7544456 100644 --- a/spacy/tests/serialize/test_serialize_config.py +++ b/spacy/tests/serialize/test_serialize_config.py @@ -67,7 +67,8 @@ width = ${components.tok2vec.model.width} parser_config_string = """ [model] @architectures = "spacy.TransitionBasedParser.v1" -nr_feature_tokens = 99 +state_type = "parser" +extra_state_tokens = false hidden_width = 66 maxout_pieces = 2 @@ -95,7 +96,11 @@ def my_parser(): MaxoutWindowEncoder(width=321, window_size=3, maxout_pieces=4, depth=2), ) parser = build_tb_parser_model( - tok2vec=tok2vec, nr_feature_tokens=7, hidden_width=65, maxout_pieces=5 + tok2vec=tok2vec, + state_type="parser", + extra_state_tokens=True, + hidden_width=65, + maxout_pieces=5, ) return parser @@ -340,3 +345,13 @@ def test_config_auto_fill_extra_fields(): assert "extra" not in nlp.config["training"] # Make sure the config generated is valid load_model_from_config(nlp.config) + + +def test_config_validate_literal(): + nlp = English() + config = Config().from_str(parser_config_string) + config["model"]["state_type"] = "nonsense" + with pytest.raises(ConfigValidationError): + nlp.add_pipe("parser", config=config) + config["model"]["state_type"] = "ner" + nlp.add_pipe("parser", config=config) diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index a9c9d8ca5..a66ab8de1 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -1,20 +1,21 @@ import pytest from click import NoSuchOption -from spacy.training import docs_to_json, biluo_tags_from_offsets -from spacy.training.converters import iob2docs, conll_ner2docs, conllu2docs +from spacy.training import docs_to_json, offsets_to_biluo_tags +from spacy.training.converters import iob_to_docs, conll_ner_to_docs, conllu_to_docs from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate from spacy.cli.init_config import init_config, RECOMMENDATIONS from spacy.cli._util import validate_project_commands, parse_config_overrides from spacy.cli._util import load_project_config, substitute_project_variables from spacy.cli._util import string_to_list, OVERRIDES_ENV_VAR -from thinc.config import ConfigValidationError +from spacy.cli.debug_config import check_section_refs +from thinc.config import ConfigValidationError, Config import srsly import os from .util import make_tempdir -def test_cli_converters_conllu2json(): +def test_cli_converters_conllu_to_docs(): # from NorNE: https://github.com/ltgoslo/norne/blob/3d23274965f513f23aa48455b28b1878dad23c05/ud/nob/no_bokmaal-ud-dev.conllu lines = [ "1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\tO", @@ -23,7 +24,7 @@ def test_cli_converters_conllu2json(): "4\tavstår\tavstå\tVERB\t_\tMood=Ind|Tense=Pres|VerbForm=Fin\t0\troot\t_\tO", ] input_data = "\n".join(lines) - converted_docs = conllu2docs(input_data, n_sents=1) + converted_docs = conllu_to_docs(input_data, n_sents=1) assert len(converted_docs) == 1 converted = [docs_to_json(converted_docs)] assert converted[0]["id"] == 0 @@ -39,7 +40,7 @@ def test_cli_converters_conllu2json(): ent_offsets = [ (e[0], e[1], e[2]) for e in converted[0]["paragraphs"][0]["entities"] ] - biluo_tags = biluo_tags_from_offsets(converted_docs[0], ent_offsets, missing="O") 
+ biluo_tags = offsets_to_biluo_tags(converted_docs[0], ent_offsets, missing="O") assert biluo_tags == ["O", "B-PER", "L-PER", "O"] @@ -62,9 +63,9 @@ def test_cli_converters_conllu2json(): ), ], ) -def test_cli_converters_conllu2json_name_ner_map(lines): +def test_cli_converters_conllu_to_docs_name_ner_map(lines): input_data = "\n".join(lines) - converted_docs = conllu2docs( + converted_docs = conllu_to_docs( input_data, n_sents=1, ner_map={"PER": "PERSON", "BAD": ""} ) assert len(converted_docs) == 1 @@ -83,11 +84,11 @@ def test_cli_converters_conllu2json_name_ner_map(lines): ent_offsets = [ (e[0], e[1], e[2]) for e in converted[0]["paragraphs"][0]["entities"] ] - biluo_tags = biluo_tags_from_offsets(converted_docs[0], ent_offsets, missing="O") + biluo_tags = offsets_to_biluo_tags(converted_docs[0], ent_offsets, missing="O") assert biluo_tags == ["O", "B-PERSON", "L-PERSON", "O", "O"] -def test_cli_converters_conllu2json_subtokens(): +def test_cli_converters_conllu_to_docs_subtokens(): # https://raw.githubusercontent.com/ohenrik/nb_news_ud_sm/master/original_data/no-ud-dev-ner.conllu lines = [ "1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\tname=O", @@ -98,7 +99,7 @@ def test_cli_converters_conllu2json_subtokens(): "5\t.\t$.\tPUNCT\t_\t_\t4\tpunct\t_\tname=O", ] input_data = "\n".join(lines) - converted_docs = conllu2docs( + converted_docs = conllu_to_docs( input_data, n_sents=1, merge_subtokens=True, append_morphology=True ) assert len(converted_docs) == 1 @@ -132,11 +133,11 @@ def test_cli_converters_conllu2json_subtokens(): ent_offsets = [ (e[0], e[1], e[2]) for e in converted[0]["paragraphs"][0]["entities"] ] - biluo_tags = biluo_tags_from_offsets(converted_docs[0], ent_offsets, missing="O") + biluo_tags = offsets_to_biluo_tags(converted_docs[0], ent_offsets, missing="O") assert biluo_tags == ["O", "U-PER", "O", "O"] -def test_cli_converters_iob2json(): +def test_cli_converters_iob_to_docs(): lines = [ "I|O like|O London|I-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O", "I|O like|O London|B-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O", @@ -144,7 +145,7 @@ def test_cli_converters_iob2json(): "I|PRP|O like|VBP|O London|NNP|B-GPE and|CC|O New|NNP|B-GPE York|NNP|I-GPE City|NNP|I-GPE .|.|O", ] input_data = "\n".join(lines) - converted_docs = iob2docs(input_data, n_sents=10) + converted_docs = iob_to_docs(input_data, n_sents=10) assert len(converted_docs) == 1 converted = docs_to_json(converted_docs) assert converted["id"] == 0 @@ -161,7 +162,7 @@ def test_cli_converters_iob2json(): assert ent.text in ["New York City", "London"] -def test_cli_converters_conll_ner2json(): +def test_cli_converters_conll_ner_to_docs(): lines = [ "-DOCSTART- -X- O O", "", @@ -211,7 +212,7 @@ def test_cli_converters_conll_ner2json(): ".\t.\t_\tO", ] input_data = "\n".join(lines) - converted_docs = conll_ner2docs(input_data, n_sents=10) + converted_docs = conll_ner_to_docs(input_data, n_sents=10) assert len(converted_docs) == 1 converted = docs_to_json(converted_docs) assert converted["id"] == 0 @@ -413,3 +414,15 @@ def test_string_to_list(value): def test_string_to_list_intify(value): assert string_to_list(value, intify=False) == ["1", "2", "3"] assert string_to_list(value, intify=True) == [1, 2, 3] + + +def test_check_section_refs(): + config = {"a": {"b": {"c": "a.d.e"}, "d": {"e": 1}}, "f": {"g": "d.f"}} + config = Config(config) + # Valid section reference + check_section_refs(config, ["a.b.c"]) + # Section that doesn't exist in this config + check_section_refs(config, 
["x.y.z"]) + # Invalid section reference + with pytest.raises(ConfigValidationError): + check_section_refs(config, ["a.b.c", "f.g"]) diff --git a/spacy/tests/test_scorer.py b/spacy/tests/test_scorer.py index a1406c14a..2825f1703 100644 --- a/spacy/tests/test_scorer.py +++ b/spacy/tests/test_scorer.py @@ -2,7 +2,7 @@ from numpy.testing import assert_almost_equal, assert_array_almost_equal import pytest from pytest import approx from spacy.training import Example -from spacy.training.iob_utils import biluo_tags_from_offsets +from spacy.training.iob_utils import offsets_to_biluo_tags from spacy.scorer import Scorer, ROCAUCScore from spacy.scorer import _roc_auc_score, _roc_curve from spacy.lang.en import English @@ -186,7 +186,7 @@ def test_ner_per_type(en_vocab): words=input_.split(" "), ents=[("CARDINAL", 0, 1), ("CARDINAL", 2, 3)], ) - entities = biluo_tags_from_offsets(doc, annot["entities"]) + entities = offsets_to_biluo_tags(doc, annot["entities"]) example = Example.from_dict(doc, {"entities": entities}) # a hack for sentence boundaries example.predicted[1].is_sent_start = False @@ -211,7 +211,7 @@ def test_ner_per_type(en_vocab): words=input_.split(" "), ents=[("ORG", 0, 1), ("GPE", 5, 6), ("ORG", 6, 7)], ) - entities = biluo_tags_from_offsets(doc, annot["entities"]) + entities = offsets_to_biluo_tags(doc, annot["entities"]) example = Example.from_dict(doc, {"entities": entities}) # a hack for sentence boundaries example.predicted[1].is_sent_start = False diff --git a/spacy/tests/training/test_training.py b/spacy/tests/training/test_training.py index 4cab5b015..a04e6aadd 100644 --- a/spacy/tests/training/test_training.py +++ b/spacy/tests/training/test_training.py @@ -1,9 +1,9 @@ import numpy -from spacy.training import biluo_tags_from_offsets, offsets_from_biluo_tags, Alignment -from spacy.training import spans_from_biluo_tags, iob_to_biluo +from spacy.training import offsets_to_biluo_tags, biluo_tags_to_offsets, Alignment +from spacy.training import biluo_tags_to_spans, iob_to_biluo from spacy.training import Corpus, docs_to_json from spacy.training.example import Example -from spacy.training.converters import json2docs +from spacy.training.converters import json_to_docs from spacy.training.augment import make_orth_variants_example from spacy.lang.en import English from spacy.tokens import Doc, DocBin @@ -69,7 +69,7 @@ def test_gold_biluo_U(en_vocab): spaces = [True, True, True, False, True] doc = Doc(en_vocab, words=words, spaces=spaces) entities = [(len("I flew to "), len("I flew to London"), "LOC")] - tags = biluo_tags_from_offsets(doc, entities) + tags = offsets_to_biluo_tags(doc, entities) assert tags == ["O", "O", "O", "U-LOC", "O"] @@ -78,7 +78,7 @@ def test_gold_biluo_BL(en_vocab): spaces = [True, True, True, True, False, True] doc = Doc(en_vocab, words=words, spaces=spaces) entities = [(len("I flew to "), len("I flew to San Francisco"), "LOC")] - tags = biluo_tags_from_offsets(doc, entities) + tags = offsets_to_biluo_tags(doc, entities) assert tags == ["O", "O", "O", "B-LOC", "L-LOC", "O"] @@ -87,7 +87,7 @@ def test_gold_biluo_BIL(en_vocab): spaces = [True, True, True, True, True, False, True] doc = Doc(en_vocab, words=words, spaces=spaces) entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")] - tags = biluo_tags_from_offsets(doc, entities) + tags = offsets_to_biluo_tags(doc, entities) assert tags == ["O", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"] @@ -100,7 +100,7 @@ def test_gold_biluo_overlap(en_vocab): (len("I flew to "), len("I flew to San 
Francisco"), "LOC"), ] with pytest.raises(ValueError): - biluo_tags_from_offsets(doc, entities) + offsets_to_biluo_tags(doc, entities) def test_gold_biluo_misalign(en_vocab): @@ -109,7 +109,7 @@ def test_gold_biluo_misalign(en_vocab): doc = Doc(en_vocab, words=words, spaces=spaces) entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")] with pytest.warns(UserWarning): - tags = biluo_tags_from_offsets(doc, entities) + tags = offsets_to_biluo_tags(doc, entities) assert tags == ["O", "O", "O", "-", "-", "-"] @@ -155,7 +155,7 @@ def test_example_from_dict_some_ner(en_vocab): @pytest.mark.filterwarnings("ignore::UserWarning") -def test_json2docs_no_ner(en_vocab): +def test_json_to_docs_no_ner(en_vocab): data = [ { "id": 1, @@ -191,7 +191,7 @@ def test_json2docs_no_ner(en_vocab): ], } ] - docs = json2docs(data) + docs = json_to_docs(data) assert len(docs) == 1 for doc in docs: assert not doc.has_annotation("ENT_IOB") @@ -358,9 +358,9 @@ def test_roundtrip_offsets_biluo_conversion(en_tokenizer): biluo_tags = ["O", "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"] offsets = [(10, 24, "LOC"), (29, 35, "GPE")] doc = en_tokenizer(text) - biluo_tags_converted = biluo_tags_from_offsets(doc, offsets) + biluo_tags_converted = offsets_to_biluo_tags(doc, offsets) assert biluo_tags_converted == biluo_tags - offsets_converted = offsets_from_biluo_tags(doc, biluo_tags) + offsets_converted = biluo_tags_to_offsets(doc, biluo_tags) offsets_converted = [ent for ent in offsets if ent[2]] assert offsets_converted == offsets @@ -368,7 +368,7 @@ def test_roundtrip_offsets_biluo_conversion(en_tokenizer): def test_biluo_spans(en_tokenizer): doc = en_tokenizer("I flew to Silicon Valley via London.") biluo_tags = ["O", "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"] - spans = spans_from_biluo_tags(doc, biluo_tags) + spans = biluo_tags_to_spans(doc, biluo_tags) spans = [span for span in spans if span.label_] assert len(spans) == 2 assert spans[0].text == "Silicon Valley" diff --git a/spacy/training/__init__.py b/spacy/training/__init__.py index 35e67f696..9172dde25 100644 --- a/spacy/training/__init__.py +++ b/spacy/training/__init__.py @@ -2,8 +2,8 @@ from .corpus import Corpus # noqa: F401 from .example import Example, validate_examples # noqa: F401 from .align import Alignment # noqa: F401 from .iob_utils import iob_to_biluo, biluo_to_iob # noqa: F401 -from .iob_utils import biluo_tags_from_offsets, offsets_from_biluo_tags # noqa: F401 -from .iob_utils import spans_from_biluo_tags, tags_to_entities # noqa: F401 +from .iob_utils import offsets_to_biluo_tags, biluo_tags_to_offsets # noqa: F401 +from .iob_utils import biluo_tags_to_spans, tags_to_entities # noqa: F401 from .gold_io import docs_to_json, read_json_file # noqa: F401 from .batchers import minibatch_by_padded_size, minibatch_by_words # noqa: F401 from .loggers import console_logger, wandb_logger # noqa: F401 diff --git a/spacy/training/converters/__init__.py b/spacy/training/converters/__init__.py index 15f025a08..e91b6aaa6 100644 --- a/spacy/training/converters/__init__.py +++ b/spacy/training/converters/__init__.py @@ -1,4 +1,4 @@ -from .iob2docs import iob2docs # noqa: F401 -from .conll_ner2docs import conll_ner2docs # noqa: F401 -from .json2docs import json2docs # noqa: F401 -from .conllu2docs import conllu2docs # noqa: F401 +from .iob_to_docs import iob_to_docs # noqa: F401 +from .conll_ner_to_docs import conll_ner_to_docs # noqa: F401 +from .json_to_docs import json_to_docs # noqa: F401 +from .conllu_to_docs import conllu_to_docs # 
noqa: F401 diff --git a/spacy/training/converters/conll_ner2docs.py b/spacy/training/converters/conll_ner_to_docs.py similarity index 99% rename from spacy/training/converters/conll_ner2docs.py rename to spacy/training/converters/conll_ner_to_docs.py index 8dcaf2599..3b851039c 100644 --- a/spacy/training/converters/conll_ner2docs.py +++ b/spacy/training/converters/conll_ner_to_docs.py @@ -7,7 +7,7 @@ from ...tokens import Doc, Span from ...util import load_model -def conll_ner2docs( +def conll_ner_to_docs( input_data, n_sents=10, seg_sents=False, model=None, no_print=False, **kwargs ): """ diff --git a/spacy/training/converters/conllu2docs.py b/spacy/training/converters/conllu_to_docs.py similarity index 97% rename from spacy/training/converters/conllu2docs.py rename to spacy/training/converters/conllu_to_docs.py index b4d8b3ac4..18a2b6a93 100644 --- a/spacy/training/converters/conllu2docs.py +++ b/spacy/training/converters/conllu_to_docs.py @@ -1,13 +1,13 @@ import re -from .conll_ner2docs import n_sents_info -from ...training import iob_to_biluo, spans_from_biluo_tags +from .conll_ner_to_docs import n_sents_info +from ...training import iob_to_biluo, biluo_tags_to_spans from ...tokens import Doc, Token, Span from ...vocab import Vocab from wasabi import Printer -def conllu2docs( +def conllu_to_docs( input_data, n_sents=10, append_morphology=False, @@ -78,7 +78,7 @@ def read_conllx( if lines: while lines[0].startswith("#"): lines.pop(0) - doc = doc_from_conllu_sentence( + doc = conllu_sentence_to_doc( vocab, lines, ner_tag_pattern, @@ -128,7 +128,7 @@ def get_entities(lines, tag_pattern, ner_map=None): return iob_to_biluo(iob) -def doc_from_conllu_sentence( +def conllu_sentence_to_doc( vocab, lines, ner_tag_pattern, @@ -215,7 +215,7 @@ def doc_from_conllu_sentence( doc[i]._.merged_lemma = lemmas[i] doc[i]._.merged_spaceafter = spaces[i] ents = get_entities(lines, ner_tag_pattern, ner_map) - doc.ents = spans_from_biluo_tags(doc, ents) + doc.ents = biluo_tags_to_spans(doc, ents) if merge_subtokens: doc = merge_conllu_subtokens(lines, doc) diff --git a/spacy/training/converters/iob2docs.py b/spacy/training/converters/iob_to_docs.py similarity index 95% rename from spacy/training/converters/iob2docs.py rename to spacy/training/converters/iob_to_docs.py index 2f6742fea..bfd981649 100644 --- a/spacy/training/converters/iob2docs.py +++ b/spacy/training/converters/iob_to_docs.py @@ -1,13 +1,13 @@ from wasabi import Printer -from .conll_ner2docs import n_sents_info +from .conll_ner_to_docs import n_sents_info from ...vocab import Vocab from ...training import iob_to_biluo, tags_to_entities from ...tokens import Doc, Span from ...util import minibatch -def iob2docs(input_data, n_sents=10, no_print=False, *args, **kwargs): +def iob_to_docs(input_data, n_sents=10, no_print=False, *args, **kwargs): """ Convert IOB files with one sentence per line and tags separated with '|' into Doc objects so they can be saved. IOB and IOB2 are accepted. 
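The renamed converters keep the call signatures of the old `*2docs` helpers. A minimal usage sketch (not part of the diff, mirroring the tests above), showing `iob_to_docs` feeding a `DocBin`; the output path is illustrative:

```python
# Sketch based on the tests in this diff: convert IOB-formatted lines into
# Doc objects with the renamed converter and serialize them for training.
from spacy.tokens import DocBin
from spacy.training.converters import iob_to_docs

lines = [
    "I|O like|O London|B-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O",
]
docs = iob_to_docs("\n".join(lines), n_sents=10)  # same arguments as the old iob2docs
DocBin(docs=docs).to_disk("./train.spacy")        # illustrative output path
```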
diff --git a/spacy/training/converters/json2docs.py b/spacy/training/converters/json_to_docs.py similarity index 82% rename from spacy/training/converters/json2docs.py rename to spacy/training/converters/json_to_docs.py index 342f94848..d7df1d6f9 100644 --- a/spacy/training/converters/json2docs.py +++ b/spacy/training/converters/json_to_docs.py @@ -1,12 +1,12 @@ import srsly from ..gold_io import json_iterate, json_to_annotations -from ..example import annotations2doc +from ..example import annotations_to_doc from ..example import _fix_legacy_dict_data, _parse_example_dict_data from ...util import load_model from ...lang.xx import MultiLanguage -def json2docs(input_data, model=None, **kwargs): +def json_to_docs(input_data, model=None, **kwargs): nlp = load_model(model) if model is not None else MultiLanguage() if not isinstance(input_data, bytes): if not isinstance(input_data, str): @@ -17,6 +17,6 @@ def json2docs(input_data, model=None, **kwargs): for json_para in json_to_annotations(json_doc): example_dict = _fix_legacy_dict_data(json_para) tok_dict, doc_dict = _parse_example_dict_data(example_dict) - doc = annotations2doc(nlp.vocab, tok_dict, doc_dict) + doc = annotations_to_doc(nlp.vocab, tok_dict, doc_dict) docs.append(doc) return docs diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx index 371b4a06a..fbf05b224 100644 --- a/spacy/training/example.pyx +++ b/spacy/training/example.pyx @@ -7,13 +7,13 @@ from ..tokens.span cimport Span from ..tokens.span import Span from ..attrs import IDS from .align import Alignment -from .iob_utils import biluo_to_iob, biluo_tags_from_offsets, biluo_tags_from_doc -from .iob_utils import spans_from_biluo_tags +from .iob_utils import biluo_to_iob, offsets_to_biluo_tags, doc_to_biluo_tags +from .iob_utils import biluo_tags_to_spans from ..errors import Errors, Warnings from ..pipeline._parser_internals import nonproj -cpdef Doc annotations2doc(vocab, tok_annot, doc_annot): +cpdef Doc annotations_to_doc(vocab, tok_annot, doc_annot): """ Create a Doc from dictionaries with token and doc annotations. """ attrs, array = _annot2array(vocab, tok_annot, doc_annot) output = Doc(vocab, words=tok_annot["ORTH"], spaces=tok_annot["SPACY"]) @@ -92,7 +92,7 @@ cdef class Example: tok_dict["SPACY"] = [tok.whitespace_ for tok in predicted] return Example( predicted, - annotations2doc(predicted.vocab, tok_dict, doc_dict) + annotations_to_doc(predicted.vocab, tok_dict, doc_dict) ) @property @@ -176,7 +176,7 @@ cdef class Example: return [None] * len(self.x) # should this be 'missing' instead of 'None' ? x_ents = self.get_aligned_spans_y2x(self.y.ents) # Default to 'None' for missing values - x_tags = biluo_tags_from_offsets( + x_tags = offsets_to_biluo_tags( self.x, [(e.start_char, e.end_char, e.label_) for e in x_ents], missing=None @@ -195,7 +195,7 @@ cdef class Example: return { "doc_annotation": { "cats": dict(self.reference.cats), - "entities": biluo_tags_from_doc(self.reference), + "entities": doc_to_biluo_tags(self.reference), "links": self._links_to_dict() }, "token_annotation": { @@ -295,12 +295,12 @@ def _add_entities_to_doc(doc, ner_data): elif isinstance(ner_data[0], tuple): return _add_entities_to_doc( doc, - biluo_tags_from_offsets(doc, ner_data) + offsets_to_biluo_tags(doc, ner_data) ) elif isinstance(ner_data[0], str) or ner_data[0] is None: return _add_entities_to_doc( doc, - spans_from_biluo_tags(doc, ner_data) + biluo_tags_to_spans(doc, ner_data) ) elif isinstance(ner_data[0], Span): # Ugh, this is super messy. 
Really hard to set O entities @@ -388,7 +388,7 @@ def _parse_ner_tags(biluo_or_offsets, vocab, words, spaces): # This is annoying but to convert the offsets we need a Doc # that has the target tokenization. reference = Doc(vocab, words=words, spaces=spaces) - biluo = biluo_tags_from_offsets(reference, biluo_or_offsets) + biluo = offsets_to_biluo_tags(reference, biluo_or_offsets) else: biluo = biluo_or_offsets ent_iobs = [] diff --git a/spacy/training/gold_io.pyx b/spacy/training/gold_io.pyx index b58df0d71..524da0a16 100644 --- a/spacy/training/gold_io.pyx +++ b/spacy/training/gold_io.pyx @@ -3,7 +3,7 @@ import srsly from .. import util from ..errors import Warnings from ..tokens import Doc -from .iob_utils import biluo_tags_from_offsets, tags_to_entities +from .iob_utils import offsets_to_biluo_tags, tags_to_entities import json @@ -32,7 +32,7 @@ def docs_to_json(docs, doc_id=0, ner_missing_tag="O"): if ent.kb_id_: link_dict = {(ent.start_char, ent.end_char): {ent.kb_id_: 1.0}} json_para["links"].append(link_dict) - biluo_tags = biluo_tags_from_offsets(doc, json_para["entities"], missing=ner_missing_tag) + biluo_tags = offsets_to_biluo_tags(doc, json_para["entities"], missing=ner_missing_tag) attrs = ("TAG", "POS", "MORPH", "LEMMA", "DEP", "ENT_IOB") include_annotation = {attr: doc.has_annotation(attr) for attr in attrs} for j, sent in enumerate(doc.sents): diff --git a/spacy/training/iob_utils.py b/spacy/training/iob_utils.py index ceb5e16b8..91fc40205 100644 --- a/spacy/training/iob_utils.py +++ b/spacy/training/iob_utils.py @@ -1,9 +1,11 @@ +from typing import List, Tuple, Iterable, Union, Iterator import warnings + from ..errors import Errors, Warnings -from ..tokens import Span +from ..tokens import Span, Doc -def iob_to_biluo(tags): +def iob_to_biluo(tags: Iterable[str]) -> List[str]: out = [] tags = list(tags) while tags: @@ -12,7 +14,7 @@ def iob_to_biluo(tags): return out -def biluo_to_iob(tags): +def biluo_to_iob(tags: Iterable[str]) -> List[str]: out = [] for tag in tags: if tag is None: @@ -23,12 +25,12 @@ def biluo_to_iob(tags): return out -def _consume_os(tags): +def _consume_os(tags: List[str]) -> Iterator[str]: while tags and tags[0] == "O": yield tags.pop(0) -def _consume_ent(tags): +def _consume_ent(tags: List[str]) -> List[str]: if not tags: return [] tag = tags.pop(0) @@ -50,15 +52,17 @@ def _consume_ent(tags): return [start] + middle + [end] -def biluo_tags_from_doc(doc, missing="O"): - return biluo_tags_from_offsets( +def doc_to_biluo_tags(doc: Doc, missing: str = "O"): + return offsets_to_biluo_tags( doc, [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents], missing=missing, ) -def biluo_tags_from_offsets(doc, entities, missing="O"): +def offsets_to_biluo_tags( + doc: Doc, entities: Iterable[Tuple[int, int, Union[str, int]]], missing: str = "O" +) -> List[str]: """Encode labelled spans into per-token tags, using the Begin/In/Last/Unit/Out scheme (BILUO). @@ -69,7 +73,7 @@ def biluo_tags_from_offsets(doc, entities, missing="O"): the original string. RETURNS (list): A list of unicode strings, describing the tags. Each tag string will be of the form either "", "O" or "{action}-{label}", where - action is one of "B", "I", "L", "U". The string "-" is used where the + action is one of "B", "I", "L", "U". The missing label is used where the entity offsets don't align with the tokenization in the `Doc` object. The training algorithm will view these as missing values. "O" denotes a non-entity token. 
"B" denotes the beginning of a multi-token entity, @@ -80,12 +84,11 @@ def biluo_tags_from_offsets(doc, entities, missing="O"): >>> text = 'I like London.' >>> entities = [(len('I like '), len('I like London'), 'LOC')] >>> doc = nlp.tokenizer(text) - >>> tags = biluo_tags_from_offsets(doc, entities) + >>> tags = offsets_to_biluo_tags(doc, entities) >>> assert tags == ["O", "O", 'U-LOC', "O"] """ # Ensure no overlapping entity labels exist tokens_in_ents = {} - starts = {token.idx: token.i for token in doc} ends = {token.idx + len(token): token.i for token in doc} biluo = ["-" for _ in doc] @@ -109,7 +112,6 @@ def biluo_tags_from_offsets(doc, entities, missing="O"): ) ) tokens_in_ents[token_index] = (start_char, end_char, label) - start_token = starts.get(start_char) end_token = ends.get(end_char) # Only interested if the tokenization is correct @@ -143,7 +145,7 @@ def biluo_tags_from_offsets(doc, entities, missing="O"): return biluo -def spans_from_biluo_tags(doc, tags): +def biluo_tags_to_spans(doc: Doc, tags: Iterable[str]) -> List[Span]: """Encode per-token tags following the BILUO scheme into Span object, e.g. to overwrite the doc.ents. @@ -161,7 +163,9 @@ def spans_from_biluo_tags(doc, tags): return spans -def offsets_from_biluo_tags(doc, tags): +def biluo_tags_to_offsets( + doc: Doc, tags: Iterable[str] +) -> List[Tuple[int, int, Union[str, int]]]: """Encode per-token tags following the BILUO scheme into entity offsets. doc (Doc): The document that the BILUO tags refer to. @@ -172,12 +176,12 @@ def offsets_from_biluo_tags(doc, tags): `end` will be character-offset integers denoting the slice into the original string. """ - spans = spans_from_biluo_tags(doc, tags) + spans = biluo_tags_to_spans(doc, tags) return [(span.start_char, span.end_char, span.label_) for span in spans] -def tags_to_entities(tags): - """ Note that the end index returned by this function is inclusive. +def tags_to_entities(tags: Iterable[str]) -> List[Tuple[str, int, int]]: + """Note that the end index returned by this function is inclusive. 
To use it for Span creation, increment the end by 1.""" entities = [] start = None @@ -209,3 +213,9 @@ def tags_to_entities(tags): else: raise ValueError(Errors.E068.format(tag=tag)) return entities + + +# Fallbacks to make backwards-compat easier +offsets_from_biluo_tags = biluo_tags_to_offsets +spans_from_biluo_tags = biluo_tags_to_spans +biluo_tags_from_offsets = offsets_to_biluo_tags diff --git a/spacy/training/loggers.py b/spacy/training/loggers.py index 92b598033..0f054d433 100644 --- a/spacy/training/loggers.py +++ b/spacy/training/loggers.py @@ -11,9 +11,12 @@ def console_logger(): def setup_printer( nlp: "Language", ) -> Tuple[Callable[[Dict[str, Any]], None], Callable]: - score_cols = list(nlp.config["training"]["score_weights"]) + # we assume here that only components are enabled that should be trained & logged + logged_pipes = nlp.pipe_names + score_weights = nlp.config["training"]["score_weights"] + score_cols = [col for col, value in score_weights.items() if value is not None] score_widths = [max(len(col), 6) for col in score_cols] - loss_cols = [f"Loss {pipe}" for pipe in nlp.pipe_names] + loss_cols = [f"Loss {pipe}" for pipe in logged_pipes] loss_widths = [max(len(col), 8) for col in loss_cols] table_header = ["E", "#"] + loss_cols + score_cols + ["Score"] table_header = [col.upper() for col in table_header] @@ -26,7 +29,7 @@ def console_logger(): try: losses = [ "{0:.2f}".format(float(info["losses"][pipe_name])) - for pipe_name in nlp.pipe_names + for pipe_name in logged_pipes ] except KeyError as e: raise KeyError( @@ -38,10 +41,15 @@ def console_logger(): ) from None scores = [] for col in score_cols: - score = float(info["other_scores"].get(col, 0.0)) - if col != "speed": - score *= 100 - scores.append("{0:.2f}".format(score)) + score = info["other_scores"].get(col, 0.0) + try: + score = float(score) + if col != "speed": + score *= 100 + scores.append("{0:.2f}".format(score)) + except TypeError: + err = Errors.E916.format(name=col, score_type=type(score)) + raise ValueError(err) from None data = ( [info["epoch"], info["step"]] + losses diff --git a/spacy/util.py b/spacy/util.py index 93000ea27..709da8d29 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -61,7 +61,7 @@ LEXEME_NORM_LANGS = ["da", "de", "el", "en", "id", "lb", "pt", "ru", "sr", "ta", # Default order of sections in the config.cfg. Not all sections needs to exist, # and additional sections are added at the end, in alphabetical order. # fmt: off -CONFIG_SECTION_ORDER = ["paths", "variables", "system", "nlp", "components", "training", "pretraining"] +CONFIG_SECTION_ORDER = ["paths", "variables", "system", "nlp", "components", "corpora", "training", "pretraining"] # fmt: on @@ -1202,21 +1202,38 @@ def get_arg_names(func: Callable) -> List[str]: return list(set([*argspec.args, *argspec.kwonlyargs])) -def combine_score_weights(weights: List[Dict[str, float]]) -> Dict[str, float]: +def combine_score_weights( + weights: List[Dict[str, float]], + overrides: Dict[str, Optional[Union[float, int]]] = SimpleFrozenDict(), +) -> Dict[str, float]: """Combine and normalize score weights defined by components, e.g. {"ents_r": 0.2, "ents_p": 0.3, "ents_f": 0.5} and {"some_other_score": 1.0}. weights (List[dict]): The weights defined by the components. + overrides (Dict[str, Optional[Union[float, int]]]): Existing scores that + should be preserved. RETURNS (Dict[str, float]): The combined and normalized weights. 
""" + # We first need to extract all None/null values for score weights that + # shouldn't be shown in the table *or* be weighted result = {} + all_weights = [] for w_dict in weights: + filtered_weights = {} + for key, value in w_dict.items(): + value = overrides.get(key, value) + if value is None: + result[key] = None + else: + filtered_weights[key] = value + all_weights.append(filtered_weights) + for w_dict in all_weights: # We need to account for weights that don't sum to 1.0 and normalize # the score weights accordingly, then divide score by the number of # components. total = sum(w_dict.values()) for key, value in w_dict.items(): - weight = round(value / total / len(weights), 2) + weight = round(value / total / len(all_weights), 2) result[key] = result.get(key, 0.0) + weight return result diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md index 30d863b17..ef2666ec0 100644 --- a/website/docs/api/architectures.md +++ b/website/docs/api/architectures.md @@ -414,7 +414,8 @@ one component. > ```ini > [model] > @architectures = "spacy.TransitionBasedParser.v1" -> nr_feature_tokens = 6 +> state_type = "ner" +> extra_state_tokens = false > hidden_width = 64 > maxout_pieces = 2 > @@ -446,15 +447,16 @@ consists of either two or three subnetworks: state representation. If not present, the output from the lower model is used as action scores directly. -| Name | Description | -| ------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `tok2vec` | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~ | -| `nr_feature_tokens` | The number of tokens in the context to use to construct the state vector. Valid choices are `1`, `2`, `3`, `6`, `8` and `13`. The `2`, `8` and `13` feature sets are designed for the parser, while the `3` and `6` feature sets are designed for the entity recognizer. The recommended feature sets are `3` for NER, and `8` for the dependency parser. ~~int~~ | -| `hidden_width` | The width of the hidden layer. ~~int~~ | -| `maxout_pieces` | How many pieces to use in the state prediction layer. Recommended values are `1`, `2` or `3`. If `1`, the maxout non-linearity is replaced with a [`Relu`](https://thinc.ai/docs/api-layers#relu) non-linearity if `use_upper` is `True`, and no non-linearity if `False`. ~~int~~ | -| `use_upper` | Whether to use an additional hidden layer after the state vector in order to predict the action scores. It is recommended to set this to `False` for large pretrained models such as transformers, and `True` for smaller networks. The upper layer is computed on CPU, which becomes a bottleneck on larger GPU-based models, where it's also less necessary. ~~bool~~ | -| `nO` | The number of actions the model will predict between. Usually inferred from data at the beginning of training, or loaded from disk. ~~int~~ | -| **CREATES** | The model using the architecture. 
~~Model[List[Docs], List[List[Floats2d]]]~~ | +| Name | Description | +| -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `tok2vec` | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~ | +| `state_type` | Which task to extract features for. Possible values are "ner" and "parser". ~~str~~ | +| `extra_state_tokens` | Whether to use an expanded feature set when extracting the state tokens. Slightly slower, but sometimes improves accuracy slightly. Defaults to `False`. ~~bool~~ | +| `hidden_width` | The width of the hidden layer. ~~int~~ | +| `maxout_pieces` | How many pieces to use in the state prediction layer. Recommended values are `1`, `2` or `3`. If `1`, the maxout non-linearity is replaced with a [`Relu`](https://thinc.ai/docs/api-layers#relu) non-linearity if `use_upper` is `True`, and no non-linearity if `False`. ~~int~~ | +| `use_upper` | Whether to use an additional hidden layer after the state vector in order to predict the action scores. It is recommended to set this to `False` for large pretrained models such as transformers, and `True` for smaller networks. The upper layer is computed on CPU, which becomes a bottleneck on larger GPU-based models, where it's also less necessary. ~~bool~~ | +| `nO` | The number of actions the model will predict between. Usually inferred from data at the beginning of training, or loaded from disk. ~~int~~ | +| **CREATES** | The model using the architecture. ~~Model[List[Docs], List[List[Floats2d]]]~~ | ## Tagging architectures {#tagger source="spacy/ml/models/tagger.py"} diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md index 3a214428b..e3b3900be 100644 --- a/website/docs/api/data-formats.md +++ b/website/docs/api/data-formats.md @@ -275,7 +275,7 @@ $ python -m spacy convert ./data.json ./output.spacy > entity label, prefixed by the BILUO marker. For example `"B-ORG"` describes > the first token of a multi-token `ORG` entity and `"U-PERSON"` a single token > representing a `PERSON` entity. The -> [`biluo_tags_from_offsets`](/api/top-level#biluo_tags_from_offsets) function +> [`offsets_to_biluo_tags`](/api/top-level#offsets_to_biluo_tags) function > can help you convert entity offsets to the right format. ```python diff --git a/website/docs/api/language.md b/website/docs/api/language.md index a7b9c0d88..dd3cc57dd 100644 --- a/website/docs/api/language.md +++ b/website/docs/api/language.md @@ -145,17 +145,16 @@ examples, see the > ) > ``` -| Name | Description | -| ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `name` | The name of the component factory. ~~str~~ | -| _keyword-only_ | | -| `default_config` | The default config, describing the default values of the factory arguments. ~~Dict[str, Any]~~ | -| `assigns` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). 
~~Iterable[str]~~ | -| `requires` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ | -| `retokenizes` | Whether the component changes tokenization. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~bool~~ | -| `scores` | All scores set by the components if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ | -| `default_score_weights` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. ~~Dict[str, float]~~ | -| `func` | Optional function if not used a a decorator. ~~Optional[Callable[[...], Callable[[Doc], Doc]]]~~ | +| Name | Description | +| ----------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `name` | The name of the component factory. ~~str~~ | +| _keyword-only_ | | +| `default_config` | The default config, describing the default values of the factory arguments. ~~Dict[str, Any]~~ | +| `assigns` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ | +| `requires` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ | +| `retokenizes` | Whether the component changes tokenization. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~bool~~ | +| `default_score_weights` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. If a weight is set to `None`, the score will not be logged or weighted. ~~Dict[str, Optional[float]]~~ | +| `func` | Optional function if not used a a decorator. ~~Optional[Callable[[...], Callable[[Doc], Doc]]]~~ | ## Language.\_\_call\_\_ {#call tag="method"} @@ -1036,12 +1035,12 @@ provided by the [`@Language.component`](/api/language#component) or component is defined and stored on the `Language` class for each component instance and factory instance. -| Name | Description | -| ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `factory` | The name of the registered component factory. ~~str~~ | -| `default_config` | The default config, describing the default values of the factory arguments. ~~Dict[str, Any]~~ | -| `assigns` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ | -| `requires` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). 
~~Iterable[str]~~  | -| `retokenizes` | Whether the component changes tokenization. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~bool~~  | -| `scores` | All scores set by the components if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ | -| `default_score_weights` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. ~~Dict[str, float]~~ | +| Name | Description | +| ----------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `factory` | The name of the registered component factory. ~~str~~ | +| `default_config` | The default config, describing the default values of the factory arguments. ~~Dict[str, Any]~~ | +| `assigns` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ | +| `requires` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~  | +| `retokenizes` | Whether the component changes tokenization. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~bool~~  | +| `default_score_weights` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. If a weight is set to `None`, the score will not be logged or weighted. ~~Dict[str, Optional[float]]~~ | +| `scores` | All scores set by the components if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. Based on the `default_score_weights` and used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ | diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index 7afe02403..f36be0806 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -619,7 +619,7 @@ sequences in the batch. ## Training data and alignment {#gold source="spacy/training"} -### training.biluo_tags_from_offsets {#biluo_tags_from_offsets tag="function"} +### training.offsets_to_biluo_tags {#offsets_to_biluo_tags tag="function"} Encode labelled spans into per-token tags, using the [BILUO scheme](/usage/linguistic-features#accessing-ner) (Begin, In, Last, Unit, @@ -632,14 +632,20 @@ the beginning of a multi-token entity, `I` the inside of an entity of three or more tokens, and `L` the end of an entity of two or more tokens. `U` denotes a single-token entity. + + +This method was previously available as `spacy.gold.biluo_tags_from_offsets`. + + + > #### Example > > ```python -> from spacy.training import biluo_tags_from_offsets +> from spacy.training import offsets_to_biluo_tags > > doc = nlp("I like London.") > entities = [(7, 13, "LOC")] -> tags = biluo_tags_from_offsets(doc, entities) +> tags = offsets_to_biluo_tags(doc, entities) > assert tags == ["O", "O", "U-LOC", "O"] > ``` @@ -647,21 +653,28 @@ single-token entity. 
| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | `doc` | The document that the entity offsets refer to. The output tags will refer to the token boundaries within the document. ~~Doc~~ | | `entities` | A sequence of `(start, end, label)` triples. `start` and `end` should be character-offset integers denoting the slice into the original string. ~~List[Tuple[int, int, Union[str, int]]]~~ | +| `missing` | The label used for missing values, e.g. if tokenization doesn't align with the entity offsets. Defaults to `"O"`. ~~str~~ | | **RETURNS** | A list of strings, describing the [BILUO](/usage/linguistic-features#accessing-ner) tags. ~~List[str]~~ | -### training.offsets_from_biluo_tags {#offsets_from_biluo_tags tag="function"} +### training.biluo_tags_to_offsets {#biluo_tags_to_offsets tag="function"} Encode per-token tags following the [BILUO scheme](/usage/linguistic-features#accessing-ner) into entity offsets. + + +This method was previously available as `spacy.gold.offsets_from_biluo_tags`. + + + > #### Example > > ```python -> from spacy.training import offsets_from_biluo_tags +> from spacy.training import biluo_tags_to_offsets > > doc = nlp("I like London.") > tags = ["O", "O", "U-LOC", "O"] -> entities = offsets_from_biluo_tags(doc, tags) +> entities = biluo_tags_to_offsets(doc, tags) > assert entities == [(7, 13, "LOC")] > ``` @@ -671,21 +684,27 @@ Encode per-token tags following the | `entities` | A sequence of [BILUO](/usage/linguistic-features#accessing-ner) tags with each tag describing one token. Each tag string will be of the form of either `""`, `"O"` or `"{action}-{label}"`, where action is one of `"B"`, `"I"`, `"L"`, `"U"`. ~~List[str]~~ | | **RETURNS** | A sequence of `(start, end, label)` triples. `start` and `end` will be character-offset integers denoting the slice into the original string. ~~List[Tuple[int, int, str]]~~ | -### training.spans_from_biluo_tags {#spans_from_biluo_tags tag="function" new="2.1"} +### training.biluo_tags_to_spans {#biluo_tags_to_spans tag="function" new="2.1"} Encode per-token tags following the [BILUO scheme](/usage/linguistic-features#accessing-ner) into [`Span`](/api/span) objects. This can be used to create entity spans from token-based tags, e.g. to overwrite the `doc.ents`. + + +This method was previously available as `spacy.gold.spans_from_biluo_tags`. + + + > #### Example > > ```python -> from spacy.training import spans_from_biluo_tags +> from spacy.training import biluo_tags_to_spans > > doc = nlp("I like London.") > tags = ["O", "O", "U-LOC", "O"] -> doc.ents = spans_from_biluo_tags(doc, tags) +> doc.ents = biluo_tags_to_spans(doc, tags) > ``` | Name | Description | diff --git a/website/docs/usage/_benchmarks-models.md b/website/docs/usage/_benchmarks-models.md index 33163f306..5b193d3a4 100644 --- a/website/docs/usage/_benchmarks-models.md +++ b/website/docs/usage/_benchmarks-models.md @@ -1,24 +1,19 @@ import { Help } from 'components/typography'; import Link from 'components/link' - +
-| System | Parser | Tagger | NER | WPS CPU (words per second on CPU, higher is better) | WPS GPU (words per second on GPU, higher is better) |
-| --------------------------------------------------------------------------- | -----: | -----: | ---: | ---: | ---: |
-| [`en_core_web_trf`](/models/en#en_core_web_trf) (spaCy v3) | | | | | 6k |
-| [`en_core_web_lg`](/models/en#en_core_web_lg) (spaCy v3) | | | | | |
-| `en_core_web_lg` (spaCy v2) | 91.9 | 97.2 | 85.9 | 10k | |
-| [Stanza](https://stanfordnlp.github.io/stanza/) (StanfordNLP)1 | _n/a_2 | _n/a_2 | 88.8 | 234 | 2k |
-| Flair | - | 97.9 | 89.3 | | |
+| Pipeline | Parser | Tagger | NER | WPS CPU (words per second on CPU, higher is better) | WPS GPU (words per second on GPU, higher is better) |
+| ----------------------------------------------------------- | -----: | -----: | ---: | ---: | ---: |
+| [`en_core_web_trf`](/models/en#en_core_web_trf) (spaCy v3) | | | | | 6k |
+| [`en_core_web_lg`](/models/en#en_core_web_lg) (spaCy v3) | | | | | |
+| `en_core_web_lg` (spaCy v2) | 91.9 | 97.2 | 85.9 | 10k | |
-**Accuracy and speed on the -[OntoNotes 5.0](https://catalog.ldc.upenn.edu/LDC2013T19) corpus.**
**1. ** -[Qi et al. (2020)](https://arxiv.org/pdf/2003.07082.pdf). **2. ** _Coming soon_: -Qi et al. don't report parsing and tagging results on OntoNotes. We're working -on training Stanza on this corpus to allow direct comparison. +**Full pipeline accuracy and speed** on the +[OntoNotes 5.0](https://catalog.ldc.upenn.edu/LDC2013T19) corpus.
@@ -26,18 +21,24 @@ on training Stanza on this corpus to allow direct comparison.
-| System | POS | UAS | LAS | -| ------------------------------------------------------------------------------ | ---: | ---: | ---: | -| spaCy RoBERTa (2020) | | | | -| spaCy CNN (2020) | | | | -| [Mrini et al.](https://khalilmrini.github.io/Label_Attention_Layer.pdf) (2019) | 97.3 | 97.4 | 96.3 | -| [Zhou and Zhao](https://www.aclweb.org/anthology/P19-1230/) (2019) | 97.3 | 97.2 | 95.7 | +| Named Entity Recognition System | OntoNotes | CoNLL '03 | +| ------------------------------------------------------------------------------ | --------: | --------: | +| spaCy RoBERTa (2020) | | 92.2 | +| spaCy CNN (2020) | 85.3 | 88.4 | +| spaCy CNN (2017) | 86.4 | | +| [Stanza](https://stanfordnlp.github.io/stanza/) (StanfordNLP)1 | 88.8 | 92.1 | +| Flair2 | 89.7 | 93.1 | +| BERT Base3 | - | 92.4 |
-**Accuracy on the Penn Treebank.** See -[NLP-progress](http://nlpprogress.com/english/dependency_parsing.html) for more -results. +**Named entity recognition accuracy** on the +[OntoNotes 5.0](https://catalog.ldc.upenn.edu/LDC2013T19) and +[CoNLL-2003](https://www.aclweb.org/anthology/W03-0419.pdf) corpora. See +[NLP-progress](http://nlpprogress.com/english/named_entity_recognition.html) for +more results. **1. ** [Qi et al. (2020)](https://arxiv.org/pdf/2003.07082.pdf). +**2. ** [Akbik et al. (2018)](https://www.aclweb.org/anthology/C18-1139/). **3. +** [Devlin et al. (2018)](https://arxiv.org/abs/1810.04805).
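The BILUO helpers renamed in `iob_utils.py` and documented in `top-level.md` above behave exactly like their old counterparts, and the old names stay importable through the backwards-compatibility aliases added at the end of `iob_utils.py`. A minimal round-trip sketch (not part of the diff, using a blank English pipeline):

```python
# Round-trip sketch for the renamed BILUO helpers (new names from this diff).
import spacy
from spacy.training import offsets_to_biluo_tags, biluo_tags_to_offsets, biluo_tags_to_spans

nlp = spacy.blank("en")
doc = nlp("I like London.")
entities = [(7, 13, "LOC")]                      # character offsets for "London"

tags = offsets_to_biluo_tags(doc, entities)      # ["O", "O", "U-LOC", "O"]
assert biluo_tags_to_offsets(doc, tags) == entities
doc.ents = biluo_tags_to_spans(doc, tags)        # e.g. to overwrite doc.ents
```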
diff --git a/website/docs/usage/embeddings-transformers.md b/website/docs/usage/embeddings-transformers.md index a855d703c..b00760e62 100644 --- a/website/docs/usage/embeddings-transformers.md +++ b/website/docs/usage/embeddings-transformers.md @@ -235,8 +235,6 @@ The `Transformer` component sets the [`Doc._.trf_data`](/api/transformer#custom_attributes) extension attribute, which lets you access the transformers outputs at runtime. - - ```cli $ python -m spacy download en_core_trf_lg ``` @@ -448,7 +446,8 @@ factory = "ner" [nlp.pipeline.ner.model] @architectures = "spacy.TransitionBasedParser.v1" -nr_feature_tokens = 3 +state_type = "ner" +extra_state_tokens = false hidden_width = 128 maxout_pieces = 3 use_upper = false diff --git a/website/docs/usage/facts-figures.md b/website/docs/usage/facts-figures.md index 75f92070a..a31559b04 100644 --- a/website/docs/usage/facts-figures.md +++ b/website/docs/usage/facts-figures.md @@ -61,12 +61,25 @@ import Benchmarks from 'usage/\_benchmarks-models.md' - +
-The easiest way to reproduce spaCy's benchmarks on the Penn Treebank is to clone -our project template. +| Dependency Parsing System | UAS | LAS | +| ------------------------------------------------------------------------------ | ---: | ---: | +| spaCy RoBERTa (2020)1 | 96.8 | 95.0 | +| spaCy CNN (2020)1 | 93.7 | 91.8 | +| [Mrini et al.](https://khalilmrini.github.io/Label_Attention_Layer.pdf) (2019) | 97.4 | 96.3 | +| [Zhou and Zhao](https://www.aclweb.org/anthology/P19-1230/) (2019) | 97.2 | 95.7 | - +
+ +**Dependency parsing accuracy** on the Penn Treebank. See +[NLP-progress](http://nlpprogress.com/english/dependency_parsing.html) for more +results. **1. ** Project template: +[`benchmarks/parsing_penn_treebank`](%%GITHUB_PROJECTS/benchmarks/parsing_penn_treebank). + +
+ +
+dependency parses. spaCy's [trained pipelines](/models) include both a parser +and a trained sentence segmenter, which is +[disabled](/usage/processing-pipelines#disabling) by default. If you only need +sentence boundaries and no parser, you can use the `enable` and `disable` +arguments on [`spacy.load`](/api/top-level#spacy.load) to enable the senter and +disable the parser. > #### senter vs. parser > diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md index 3d756215f..dbf0881ac 100644 --- a/website/docs/usage/processing-pipelines.md +++ b/website/docs/usage/processing-pipelines.md @@ -253,8 +253,6 @@ different mechanisms you can use: Disabled and excluded component names can be provided to [`spacy.load`](/api/top-level#spacy.load) as a list. - - > #### 💡 Optional pipeline components > > The `disable` mechanism makes it easy to distribute pipeline packages with @@ -262,6 +260,11 @@ Disabled and excluded component names can be provided to > your pipeline may include a statistical _and_ a rule-based component for > sentence segmentation, and you can choose which one to run depending on your > use case. +> +> For example, spaCy's [trained pipelines](/models) like +> [`en_core_web_sm`](/models/en#en_core_web_sm) contain both a `parser` and +> `senter` that perform sentence segmentation, but the `senter` is disabled by +> default. ```python # Load the pipeline without the entity recognizer @@ -1501,7 +1504,7 @@ add those entities to the `doc.ents`, you can wrap it in a custom pipeline component function and pass it the token texts from the `Doc` object received by the component. -The [`training.spans_from_biluo_tags`](/api/top-level#spans_from_biluo_tags) is very +The [`training.biluo_tags_to_spans`](/api/top-level#biluo_tags_to_spans) is very helpful here, because it takes a `Doc` object and token-based BILUO tags and returns a sequence of `Span` objects in the `Doc` with added labels. So all your wrapper has to do is compute the entity spans and overwrite the `doc.ents`. @@ -1516,14 +1519,14 @@ wrapper has to do is compute the entity spans and overwrite the `doc.ents`. ```python ### {highlight="1,8-9"} import your_custom_entity_recognizer -from spacy.training import offsets_from_biluo_tags +from spacy.training import biluo_tags_to_spans from spacy.language import Language @Language.component("custom_ner_wrapper") def custom_ner_wrapper(doc): words = [token.text for token in doc] custom_entities = your_custom_entity_recognizer(words) - doc.ents = spans_from_biluo_tags(doc, custom_entities) + doc.ents = biluo_tags_to_spans(doc, custom_entities) return doc ``` diff --git a/website/docs/usage/projects.md b/website/docs/usage/projects.md index 95e20525a..6d5746308 100644 --- a/website/docs/usage/projects.md +++ b/website/docs/usage/projects.md @@ -213,7 +213,7 @@ a quick web demo. It looks pretty similar to a config file used to define CI pipelines. ```yaml -https://github.com/explosion/projects/tree/v3/pipelines/tagger_parser_ud/project.yml +%%GITHUB_PROJECTS/pipelines/tagger_parser_ud/project.yml ``` | Section | Description | @@ -733,7 +733,10 @@ workflows, but only one can be tracked by DVC. The Prodigy integration will require a nightly version of Prodigy that supports -spaCy v3+. +spaCy v3+. You can already use annotations created with Prodigy in spaCy v3 by +exporting your data with +[`data-to-spacy`](https://prodi.gy/docs/recipes#data-to-spacy) and running +[`spacy convert`](/api/cli#convert) to convert it to the binary format. 
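The senter/parser note above boils down to loading a pipeline with the parser disabled and re-enabling the statistical sentence segmenter. A small sketch, assuming an installed v3 `en_core_web_sm` package (whose `senter` ships disabled by default):

```python
# Sketch: sentence boundaries from the statistical senter instead of the parser.
# Assumes an installed spaCy v3 "en_core_web_sm" package.
import spacy

nlp = spacy.load("en_core_web_sm", disable=["parser"])
nlp.enable_pipe("senter")  # turn the disabled sentence segmenter back on
doc = nlp("This is a sentence. This is another one.")
print([sent.text for sent in doc.sents])
```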
diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md index b63145636..65afd0eb4 100644 --- a/website/docs/usage/training.md +++ b/website/docs/usage/training.md @@ -470,6 +470,7 @@ score. ```ini [training.score_weights] dep_las = 0.4 +dep_uas = null ents_f = 0.4 tag_acc = 0.2 token_acc = 0.0 @@ -481,9 +482,9 @@ you generate a config for a given pipeline, the score weights are generated by combining and normalizing the default score weights of the pipeline components. The default score weights are defined by each pipeline component via the `default_score_weights` setting on the -[`@Language.component`](/api/language#component) or -[`@Language.factory`](/api/language#factory). By default, all pipeline -components are weighted equally. +[`@Language.factory`](/api/language#factory) decorator. By default, all pipeline +components are weighted equally. If a score weight is set to `null`, it will be +excluded from the logs and the score won't be weighted. diff --git a/website/docs/usage/v3.md b/website/docs/usage/v3.md index 406ba4b75..91d97cae2 100644 --- a/website/docs/usage/v3.md +++ b/website/docs/usage/v3.md @@ -88,7 +88,10 @@ import Benchmarks from 'usage/\_benchmarks-models.md' - **Architectures: ** [TransformerModel](/api/architectures#TransformerModel), [TransformerListener](/api/architectures#TransformerListener), [Tok2VecTransformer](/api/architectures#Tok2VecTransformer) -- **Trained Pipelines:** [`en_core_web_trf`](/models/en#en_core_web_trf) +- **Trained Pipelines:** [`en_core_web_trf`](/models/en#en_core_web_trf), + [`de_dep_news_trf`](/models/de#de_dep_news_trf), + [`es_dep_news_trf`](/models/es#es_dep_news_trf), + [`fr_dep_news_trf`](/models/fr#fr_dep_news_trf) - **Implementation:** [`spacy-transformers`](https://github.com/explosion/spacy-transformers) @@ -548,17 +551,19 @@ Note that spaCy v3.0 now requires **Python 3.6+**. ### Removed or renamed API {#incompat-removed} -| Removed | Replacement | -| -------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------ | -| `Language.disable_pipes` | [`Language.select_pipes`](/api/language#select_pipes), [`Language.disable_pipe`](/api/language#disable_pipe) | -| `GoldParse` | [`Example`](/api/example) | -| `GoldCorpus` | [`Corpus`](/api/corpus) | -| `KnowledgeBase.load_bulk`, `KnowledgeBase.dump` | [`KnowledgeBase.from_disk`](/api/kb#from_disk), [`KnowledgeBase.to_disk`](/api/kb#to_disk) | -| `Matcher.pipe`, `PhraseMatcher.pipe` | not needed | -| `spacy init-model` | [`spacy init vocab`](/api/cli#init-vocab) | -| `spacy debug-data` | [`spacy debug data`](/api/cli#debug-data) | -| `spacy profile` | [`spacy debug profile`](/api/cli#debug-profile) | -| `spacy link`, `util.set_data_path`, `util.get_data_path` | not needed, symlinks are deprecated | +| Removed | Replacement | +| -------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `Language.disable_pipes` | [`Language.select_pipes`](/api/language#select_pipes), [`Language.disable_pipe`](/api/language#disable_pipe) | +| `Doc.is_tagged`, `Doc.is_parsed`, ... 
| [`Doc.has_annotation`](/api/doc#has_annotation) | +| `GoldParse` | [`Example`](/api/example) | +| `GoldCorpus` | [`Corpus`](/api/corpus) | +| `KnowledgeBase.load_bulk`, `KnowledgeBase.dump` | [`KnowledgeBase.from_disk`](/api/kb#from_disk), [`KnowledgeBase.to_disk`](/api/kb#to_disk) | +| `Matcher.pipe`, `PhraseMatcher.pipe` | not needed | +| `gold.offsets_from_biluo_tags`, `gold.spans_from_biluo_tags`, `gold.biluo_tags_from_offsets` | [`training.biluo_tags_to_offsets`](/api/top-level#biluo_tags_to_offsets), [`training.biluo_tags_to_spans`](/api/top-level#biluo_tags_to_spans), [`training.offsets_to_biluo_tags`](/api/top-level#offsets_to_biluo_tags) | +| `spacy init-model` | [`spacy init vocab`](/api/cli#init-vocab) | +| `spacy debug-data` | [`spacy debug data`](/api/cli#debug-data) | +| `spacy profile` | [`spacy debug profile`](/api/cli#debug-profile) | +| `spacy link`, `util.set_data_path`, `util.get_data_path` | not needed, symlinks are deprecated | The following deprecated methods, attributes and arguments were removed in v3.0. Most of them have been **deprecated for a while** and many would previously @@ -968,16 +973,17 @@ python -m spacy package ./output ./packages #### Data utilities and gold module {#migrating-gold} -The `spacy.gold` module has been renamed to `spacy.training`. This mostly -affects internals, but if you've been using the span offset conversion utilities -[`biluo_tags_from_offsets`](/api/top-level#biluo_tags_from_offsets), -[`offsets_from_biluo_tags`](/api/top-level#offsets_from_biluo_tags) or -[`spans_from_biluo_tags`](/api/top-level#spans_from_biluo_tags), you'll have to -change your imports: +The `spacy.gold` module has been renamed to `spacy.training` and the conversion +utilities now follow the naming format of `x_to_y`. This mostly affects +internals, but if you've been using the span offset conversion utilities +[`offsets_to_biluo_tags`](/api/top-level#offsets_to_biluo_tags), +[`biluo_tags_to_offsets`](/api/top-level#biluo_tags_to_offsets) or +[`biluo_tags_to_spans`](/api/top-level#biluo_tags_to_spans), you'll have to +change your names and imports: ```diff -- from spacy.gold import biluo_tags_from_offsets, spans_from_biluo_tags -+ from spacy.training import biluo_tags_from_offsets, spans_from_biluo_tags +- from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags, spans_from_biluo_tags ++ from spacy.training import offsets_to_biluo_tags, biluo_tags_to_offsets, biluo_tags_to_spans ``` #### Migration notes for plugin maintainers {#migrating-plugins} diff --git a/website/gatsby-config.js b/website/gatsby-config.js index 5e3b5b537..c1a2f9ab9 100644 --- a/website/gatsby-config.js +++ b/website/gatsby-config.js @@ -24,6 +24,7 @@ const branch = isNightly ? 'develop' : 'master' // Those variables are going to be replaced in the Markdown, e.g. 
 %%GITHUB_SPACY
     const replacements = {
         GITHUB_SPACY: `https://github.com/explosion/spaCy/tree/${branch}`,
+        GITHUB_PROJECTS: `https://github.com/${site.projectsRepo}`,
     }
 /**
diff --git a/website/meta/languages.json b/website/meta/languages.json
index 493f96c49..5ef3a6469 100644
--- a/website/meta/languages.json
+++ b/website/meta/languages.json
@@ -1,21 +1,11 @@
 {
     "languages": [
-        {
-            "code": "zh",
-            "name": "Chinese",
-            "models": ["zh_core_web_sm", "zh_core_web_md", "zh_core_web_lg"],
-            "dependencies": [
-                {
-                    "name": "Jieba",
-                    "url": "https://github.com/fxsjy/jieba"
-                },
-                {
-                    "name": "PKUSeg",
-                    "url": "https://github.com/lancopku/PKUSeg-python"
-                }
-            ],
-            "has_examples": true
-        },
+        { "code": "af", "name": "Afrikaans" },
+        { "code": "ar", "name": "Arabic", "example": "هذه جملة", "has_examples": true },
+        { "code": "bg", "name": "Bulgarian", "example": "Това е изречение", "has_examples": true },
+        { "code": "bn", "name": "Bengali", "has_examples": true },
+        { "code": "ca", "name": "Catalan", "example": "Això és una frase.", "has_examples": true },
+        { "code": "cs", "name": "Czech", "has_examples": true },
         {
             "code": "da",
             "name": "Danish",
@@ -23,39 +13,10 @@
             "has_examples": true,
             "models": ["da_core_news_sm", "da_core_news_md", "da_core_news_lg"]
         },
-        {
-            "code": "nl",
-            "name": "Dutch",
-            "models": ["nl_core_news_sm", "nl_core_news_md", "nl_core_news_lg"],
-            "example": "Dit is een zin.",
-            "has_examples": true
-        },
-        {
-            "code": "en",
-            "name": "English",
-            "models": ["en_core_web_sm", "en_core_web_md", "en_core_web_lg"],
-            "starters": [
-                "en_vectors_web_lg",
-                "en_trf_bertbaseuncased_lg",
-                "en_trf_robertabase_lg",
-                "en_trf_distilbertbaseuncased_lg",
-                "en_trf_xlnetbasecased_lg"
-            ],
-            "example": "This is a sentence.",
-            "has_examples": true
-        },
-        {
-            "code": "fr",
-            "name": "French",
-            "models": ["fr_core_news_sm", "fr_core_news_md", "fr_core_news_lg"],
-            "example": "C'est une phrase.",
-            "has_examples": true
-        },
         {
             "code": "de",
             "name": "German",
-            "models": ["de_core_news_sm", "de_core_news_md", "de_core_news_lg"],
-            "starters": ["de_trf_bertbasecased_lg"],
+            "models": ["de_core_news_sm", "de_core_news_md", "de_core_news_lg", "de_dep_news_trf"],
             "example": "Dies ist ein Satz.",
             "has_examples": true
         },
@@ -66,6 +27,46 @@
             "example": "Αυτή είναι μια πρόταση.",
             "has_examples": true
         },
+        {
+            "code": "en",
+            "name": "English",
+            "models": ["en_core_web_sm", "en_core_web_md", "en_core_web_lg", "en_core_web_trf"],
+            "starters": ["en_vectors_web_lg"],
+            "example": "This is a sentence.",
+            "has_examples": true
+        },
+        {
+            "code": "es",
+            "name": "Spanish",
+            "models": ["es_core_news_sm", "es_core_news_md", "es_core_news_lg", "es_dep_news_trf"],
+            "example": "Esto es una frase.",
+            "has_examples": true
+        },
+        { "code": "et", "name": "Estonian" },
+        { "code": "eu", "name": "Basque", "has_examples": true },
+        { "code": "fa", "name": "Persian", "has_examples": true },
+        { "code": "fi", "name": "Finnish", "has_examples": true },
+        {
+            "code": "fr",
+            "name": "French",
+            "models": ["fr_core_news_sm", "fr_core_news_md", "fr_core_news_lg", "fr_dep_news_trf"],
+            "example": "C'est une phrase.",
+            "has_examples": true
+        },
+        { "code": "ga", "name": "Irish" },
+        { "code": "gu", "name": "Gujarati", "has_examples": true },
+        { "code": "he", "name": "Hebrew", "example": "זהו משפט.", "has_examples": true },
+        { "code": "hi", "name": "Hindi", "example": "यह एक वाक्य है।", "has_examples": true },
+        { "code": "hr", "name": "Croatian", "has_examples": true },
+        { "code": "hu", "name": "Hungarian", "example": "Ez egy mondat.", "has_examples": true },
+        { "code": "hy", "name": "Armenian", "has_examples": true },
+        {
+            "code": "id",
+            "name": "Indonesian",
+            "example": "Ini adalah sebuah kalimat.",
+            "has_examples": true
+        },
+        { "code": "is", "name": "Icelandic" },
         {
             "code": "it",
             "name": "Italian",
@@ -88,12 +89,37 @@
             "example": "これは文章です。",
             "has_examples": true
         },
+        { "code": "kn", "name": "Kannada", "has_examples": true },
+        {
+            "code": "ko",
+            "name": "Korean",
+            "dependencies": [
+                {
+                    "name": "mecab-ko",
+                    "url": "https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md"
+                },
+                { "name": "mecab-ko-dic", "url": "https://bitbucket.org/eunjeon/mecab-ko-dic" },
+                { "name": "natto-py", "url": "https://github.com/buruzaemon/natto-py" }
+            ],
+            "example": "이것은 문장입니다.",
+            "has_examples": true
+        },
+        { "code": "lb", "name": "Luxembourgish", "has_examples": true },
+        {
+            "code": "lij",
+            "name": "Ligurian",
+            "example": "Sta chì a l'é unna fraxe.",
+            "has_examples": true
+        },
         {
             "code": "lt",
             "name": "Lithuanian",
             "has_examples": true,
             "models": ["lt_core_news_sm", "lt_core_news_md", "lt_core_news_lg"]
         },
+        { "code": "lv", "name": "Latvian" },
+        { "code": "ml", "name": "Malayalam", "has_examples": true },
+        { "code": "mr", "name": "Marathi" },
         {
             "code": "nb",
             "name": "Norwegian Bokmål",
@@ -101,6 +127,14 @@
             "has_examples": true,
             "models": ["nb_core_news_sm", "nb_core_news_md", "nb_core_news_lg"]
         },
+        { "code": "ne", "name": "Nepali", "has_examples": true },
+        {
+            "code": "nl",
+            "name": "Dutch",
+            "models": ["nl_core_news_sm", "nl_core_news_md", "nl_core_news_lg"],
+            "example": "Dit is een zin.",
+            "has_examples": true
+        },
         {
             "code": "pl",
             "name": "Polish",
@@ -122,69 +156,26 @@
             "has_examples": true,
             "models": ["ro_core_news_sm", "ro_core_news_md", "ro_core_news_lg"]
         },
-        {
-            "code": "es",
-            "name": "Spanish",
-            "models": ["es_core_news_sm", "es_core_news_md", "es_core_news_lg"],
-            "example": "Esto es una frase.",
-            "has_examples": true
-        },
-        { "code": "sv", "name": "Swedish", "has_examples": true },
-        { "code": "fi", "name": "Finnish", "has_examples": true },
-        { "code": "hu", "name": "Hungarian", "example": "Ez egy mondat.", "has_examples": true },
         {
             "code": "ru",
             "name": "Russian",
             "has_examples": true,
             "dependencies": [{ "name": "pymorphy2", "url": "https://github.com/kmike/pymorphy2" }]
         },
-        {
-            "code": "uk",
-            "name": "Ukrainian",
-            "has_examples": true,
-            "dependencies": [{ "name": "pymorphy2", "url": "https://github.com/kmike/pymorphy2" }]
-        },
-        { "code": "hr", "name": "Croatian", "has_examples": true },
-        { "code": "eu", "name": "Basque", "has_examples": true },
-        { "code": "yo", "name": "Yoruba", "has_examples": true },
-        { "code": "tr", "name": "Turkish", "example": "Bu bir cümledir.", "has_examples": true },
-        { "code": "ca", "name": "Catalan", "example": "Això és una frase.", "has_examples": true },
-        { "code": "he", "name": "Hebrew", "example": "זהו משפט.", "has_examples": true },
-        { "code": "ar", "name": "Arabic", "example": "هذه جملة", "has_examples": true },
-        { "code": "fa", "name": "Persian", "has_examples": true },
-        { "code": "ur", "name": "Urdu", "example": "یہ ایک جملہ ہے", "has_examples": true },
-        { "code": "tt", "name": "Tatar", "has_examples": true },
-        { "code": "te", "name": "Telugu", "example": "ఇది ఒక వాక్యం.", "has_examples": true },
+        { "code": "sa", "name": "Sanskrit", "has_examples": true },
         { "code": "si", "name": "Sinhala", "example": "මෙය වාක්‍යයකි.", "has_examples": true },
-        { "code": "ga", "name": "Irish" },
-        { "code": "bn", "name": "Bengali", "has_examples": true },
-        { "code": "hi", "name": "Hindi", "example": "यह एक वाक्य है।", "has_examples": true },
-        { "code": "mr", "name": "Marathi" },
-        { "code": "kn", "name": "Kannada" },
-        { "code": "ta", "name": "Tamil", "has_examples": true },
-        {
-            "code": "id",
-            "name": "Indonesian",
-            "example": "Ini adalah sebuah kalimat.",
-            "has_examples": true
-        },
-        { "code": "tl", "name": "Tagalog" },
-        { "code": "af", "name": "Afrikaans" },
-        { "code": "bg", "name": "Bulgarian", "example": "Това е изречение", "has_examples": true },
-        { "code": "cs", "name": "Czech" },
-        { "code": "is", "name": "Icelandic" },
-        { "code": "lv", "name": "Latvian" },
-        { "code": "sr", "name": "Serbian" },
-        { "code": "sk", "name": "Slovak" },
+        { "code": "sk", "name": "Slovak", "has_examples": true },
         { "code": "sl", "name": "Slovenian" },
-        { "code": "lb", "name": "Luxembourgish" },
         { "code": "sq", "name": "Albanian", "example": "Kjo është një fjali.", "has_examples": true },
-        { "code": "et", "name": "Estonian" },
+        { "code": "sr", "name": "Serbian", "has_examples": true },
+        { "code": "sv", "name": "Swedish", "has_examples": true },
+        { "code": "ta", "name": "Tamil", "has_examples": true },
+        { "code": "te", "name": "Telugu", "example": "ఇది ఒక వాక్యం.", "has_examples": true },
         {
             "code": "th",
             "name": "Thai",
@@ -194,51 +185,43 @@
             "example": "นี่คือประโยค",
             "has_examples": true
         },
+        { "code": "tl", "name": "Tagalog" },
+        { "code": "tr", "name": "Turkish", "example": "Bu bir cümledir.", "has_examples": true },
+        { "code": "tt", "name": "Tatar", "has_examples": true },
         {
-            "code": "ko",
-            "name": "Korean",
-            "dependencies": [
-                {
-                    "name": "mecab-ko",
-                    "url": "https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md"
-                },
-                { "name": "mecab-ko-dic", "url": "https://bitbucket.org/eunjeon/mecab-ko-dic" },
-                { "name": "natto-py", "url": "https://github.com/buruzaemon/natto-py" }
-            ],
-            "example": "이것은 문장입니다.",
-            "has_examples": true
+            "code": "uk",
+            "name": "Ukrainian",
+            "has_examples": true,
+            "dependencies": [{ "name": "pymorphy2", "url": "https://github.com/kmike/pymorphy2" }]
         },
+        { "code": "ur", "name": "Urdu", "example": "یہ ایک جملہ ہے", "has_examples": true },
         {
             "code": "vi",
             "name": "Vietnamese",
             "dependencies": [{ "name": "Pyvi", "url": "https://github.com/trungtv/pyvi" }]
         },
-        {
-            "code": "lij",
-            "name": "Ligurian",
-            "example": "Sta chì a l'é unna fraxe.",
-            "has_examples": true
-        },
-        {
-            "code": "hy",
-            "name": "Armenian",
-            "has_examples": true
-        },
-        {
-            "code": "gu",
-            "name": "Gujarati",
-            "has_examples": true
-        },
-        {
-            "code": "ml",
-            "name": "Malayalam",
-            "has_examples": true
-        },
         {
             "code": "xx",
             "name": "Multi-language",
             "models": ["xx_ent_wiki_sm"],
             "example": "This is a sentence about Facebook."
+        },
+        { "code": "yo", "name": "Yoruba", "has_examples": true },
+        {
+            "code": "zh",
+            "name": "Chinese",
+            "models": ["zh_core_web_sm", "zh_core_web_md", "zh_core_web_lg"],
+            "dependencies": [
+                {
+                    "name": "Jieba",
+                    "url": "https://github.com/fxsjy/jieba"
+                },
+                {
+                    "name": "PKUSeg",
+                    "url": "https://github.com/lancopku/PKUSeg-python"
+                }
+            ],
+            "has_examples": true
         }
     ],
     "licenses": [
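The reordered `languages` array above follows a single entry schema. Below is a minimal sketch of that shape in JavaScript, using only fields that actually appear in this diff; the comments are my reading of how the fields are used, not documentation taken from the file itself.

```js
// Sketch of one entry in website/meta/languages.json, based on the diff above.
// Only "code" and "name" appear on every entry; everything else is optional.
const entry = {
    code: 'en',                      // language code, lowercase
    name: 'English',                 // display name shown on the website
    models: ['en_core_web_sm'],      // optional: trained pipeline packages
    starters: ['en_vectors_web_lg'], // optional: starter packages such as vectors
    example: 'This is a sentence.',  // optional: sentence for the usage snippet
    has_examples: true,              // optional: language ships example sentences
    dependencies: [                  // optional: external libraries for tokenization
        { name: 'Jieba', url: 'https://github.com/fxsjy/jieba' },
    ],
}
```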
diff --git a/website/src/components/infobox.js b/website/src/components/infobox.js
index 968b6cea8..6df8426b8 100644
--- a/website/src/components/infobox.js
+++ b/website/src/components/infobox.js
@@ -1,4 +1,4 @@
-import React from 'react'
+import React, { Fragment } from 'react'
 import PropTypes from 'prop-types'
 import classNames from 'classnames'
 
@@ -14,30 +14,34 @@ export default function Infobox({
     className,
     children,
 }) {
+    const Wrapper = id ? 'div' : Fragment
     const infoboxClassNames = classNames(classes.root, className, {
         [classes.list]: !!list,
         [classes.warning]: variant === 'warning',
        [classes.danger]: variant === 'danger',
     })
     return (
-
+
     )
 }
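The Infobox change above switches the outer element depending on whether an anchor `id` was passed: a real `div` when an anchor target is needed, otherwise a `Fragment` that adds no extra DOM node. The hunk's JSX body did not survive extraction, so the sketch below only illustrates the conditional-wrapper pattern itself; the inner markup and prop handling are assumptions, not spaCy's actual component.

```jsx
import React, { Fragment } from 'react'

// Conditional wrapper: a 'div' when an id is given, a Fragment otherwise.
// Everything inside the return is illustrative placeholder markup.
export default function Box({ id, children }) {
    const Wrapper = id ? 'div' : Fragment
    // Fragment only accepts `key`, so real props are passed only to the <div>.
    const wrapperProps = id ? { id } : {}
    return (
        <Wrapper {...wrapperProps}>
            <aside>{children}</aside>
        </Wrapper>
    )
}
```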
diff --git a/website/src/templates/models.js b/website/src/templates/models.js
index 5061972b8..5d705048b 100644
--- a/website/src/templates/models.js
+++ b/website/src/templates/models.js
@@ -12,7 +12,6 @@ import Tag from '../components/tag'
 import { H2, Label } from '../components/typography'
 import Icon from '../components/icon'
 import Link from '../components/link'
-import Grid from '../components/grid'
 import Infobox from '../components/infobox'
 import Accordion from '../components/accordion'
 import { join, arrayToObj, abbrNum, markdownToReact } from '../components/util'
@@ -31,10 +30,16 @@ const MODEL_META = {
     wiki: 'Wikipedia',
     uas: 'Unlabelled dependencies',
     las: 'Labelled dependencies',
+    token_acc: 'Tokenization',
+    tok: 'Tokenization',
     tags_acc: 'Part-of-speech tags (fine grained tags, Token.tag)',
-    ents_f: 'Entities (F-score)',
-    ents_p: 'Entities (precision)',
-    ents_r: 'Entities (recall)',
+    tag: 'Part-of-speech tags (fine grained tags, Token.tag)',
+    ents_f: 'Named entities (F-score)',
+    ents_p: 'Named entities (precision)',
+    ents_r: 'Named entities (recall)',
+    sent_f: 'Sentence segmentation (F-score)',
+    sent_p: 'Sentence segmentation (precision)',
+    sent_r: 'Sentence segmentation (recall)',
     cpu: 'words per second on CPU',
     gpu: 'words per second on GPU',
     pipeline: 'Active processing pipeline components in order',
@@ -83,25 +88,19 @@ function formatVectors(data) {
 }
 
 function formatAccuracy(data) {
-    if (!data) return null
-    const labels = {
-        las: 'LAS',
-        uas: 'UAS',
-        tags_acc: 'TAG',
-        ents_f: 'NER F',
-        ents_p: 'NER P',
-        ents_r: 'NER R',
-    }
-    const isSyntax = key => ['tags_acc', 'las', 'uas'].includes(key)
-    const isNer = key => key.startsWith('ents_')
+    if (!data) return []
     return Object.keys(data)
-        .filter(key => labels[key])
-        .map(key => ({
-            label: labels[key],
-            value: data[key].toFixed(2),
-            help: MODEL_META[key],
-            type: isNer(key) ? 'ner' : isSyntax(key) ? 'syntax' : null,
-        }))
+        .map(label => {
+            const value = data[label]
+            return isNaN(value)
+                ? null
+                : {
+                      label,
+                      value: value.toFixed(2),
+                      help: MODEL_META[label],
+                  }
+        })
+        .filter(item => item)
 }
 
 function formatModelMeta(data) {
@@ -188,16 +187,6 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl
         { label: 'Author', content: author },
         { label: 'License', content: license },
     ]
-    const accuracy = [
-        {
-            label: 'Syntax Accuracy',
-            items: meta.accuracy ? meta.accuracy.filter(a => a.type === 'syntax') : null,
-        },
-        {
-            label: 'NER Accuracy',
-            items: meta.accuracy ? meta.accuracy.filter(a => a.type === 'ner') : null,
-        },
-    ]
     const error = (

@@ -209,7 +198,6 @@

     )
-
     return (

-
-            {accuracy &&
-                accuracy.map(({ label, items }, i) =>
-                    !items ? null : (
-
-
-                            {label}
-
-
-                        {items.map((item, i) => (
-
-
-                                {item.value}
-
-
-                        ))}
-
-
-                    )
-                )}
-
             {meta.notes && markdownToReact(meta.notes, MARKDOWN_COMPONENTS)}
             {hasInteractiveCode && (
@@ -288,7 +249,7 @@
                         `import spacy`,
                         `from spacy.lang.${langId}.examples import sentences `,
                         ``,
-                        `nlp = spacy.load('${name}')`,
+                        `nlp = spacy.load("${name}")`,
                         `doc = nlp(sentences[0])`,
                         `print(doc.text)`,
                         `for token in doc:`,
@@ -296,6 +257,25 @@
                     ].join('\n')}
                 )}
+            {meta.accuracy && (
+
+
+
+                    {meta.accuracy.map(({ label, value, help }) => (
+
+
+                            {label.toUpperCase()}
+                            {help}
+
+                            {value}
+
+
+                    ))}
+
+
+
+            )}
             {labels && (

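The rewritten `formatAccuracy` above no longer whitelists a fixed set of scores: it keeps every numeric field of `meta.accuracy`, formats it to two decimals, and looks the human-readable description up in `MODEL_META`; those rows then feed the accuracy table added in the hunk just above. Below is a standalone sketch of that logic so it can be run outside the React template; the input numbers are invented, and `MODEL_META` is trimmed to the entries the example needs.

```js
// Standalone re-implementation of the formatAccuracy logic from the hunk above.
const MODEL_META = {
    tag: 'Part-of-speech tags (fine grained tags, Token.tag)',
    ents_f: 'Named entities (F-score)',
    sent_f: 'Sentence segmentation (F-score)',
}

function formatAccuracy(data) {
    if (!data) return []
    return Object.keys(data)
        .map(label => {
            const value = data[label]
            return isNaN(value)
                ? null
                : { label, value: value.toFixed(2), help: MODEL_META[label] }
        })
        .filter(item => item)
}

// Made-up scores: every numeric field becomes a row, non-numeric fields are dropped.
console.log(formatAccuracy({ tag: 0.97, ents_f: 0.85, speed: 'n/a', sent_f: 0.89 }))
// -> rows for "tag", "ents_f" and "sent_f"; the non-numeric "speed" entry is skipped.
```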
@@ -313,7 +293,7 @@
                     const labelNames = labels[pipe] || []
                     const help = LABEL_SCHEME_META[pipe]
                     return (
-
+

-
+

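For reference, the quickstart hunk earlier in this file builds the per-model usage snippet by joining template-literal lines, now using double quotes inside `spacy.load(...)`. A runnable sketch of that assembly outside the React template follows; the variable values are examples, and the lines after `for token in doc:` are omitted because they fall outside the hunk context shown above.

```js
// Assemble the Python usage snippet the model page displays, mirroring the
// joined template-literal lines visible in the quickstart hunk above.
const langId = 'de'
const name = 'de_core_news_sm'
const snippet = [
    `import spacy`,
    `from spacy.lang.${langId}.examples import sentences `,
    ``,
    `nlp = spacy.load("${name}")`,
    `doc = nlp(sentences[0])`,
    `print(doc.text)`,
    `for token in doc:`,
    // ...the remaining snippet lines are outside the hunk context above
].join('\n')

console.log(snippet)
```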
diff --git a/website/src/widgets/languages.js b/website/src/widgets/languages.js
index bb26e57cd..74d850182 100644
--- a/website/src/widgets/languages.js
+++ b/website/src/widgets/languages.js
@@ -22,7 +22,7 @@ const Language = ({ name, code, models }) => (
         {models && models.length ? (
-            {models.length} {models.length === 1 ? 'model' : 'models'}
+            {models.length} {models.length === 1 ? 'package' : 'packages'}
         ) : (
             none yet
@@ -51,7 +51,7 @@ const Languages = () => (
                         Language
                         Code
                         Language Data
-                        Models
+                        Pipelines

diff --git a/website/src/widgets/project.js b/website/src/widgets/project.js
index 8d309394d..9e23d60ea 100644
--- a/website/src/widgets/project.js
+++ b/website/src/widgets/project.js
@@ -16,7 +16,8 @@ export default function Project({
 }) {
     const repoArg = repo ? ` --repo ${repo}` : ''
     const text = `${COMMAND} ${id}${repoArg}`
-    const url = `${repo || projectsRepo}/${id}`
+    const defaultRepo = `https://github.com/${projectsRepo}`
+    const url = `${repo || defaultRepo}/${id}`
     const header = (
         <>
            {title}:{' '}