Mirror of https://github.com/explosion/spaCy.git (synced 2025-10-31 16:07:41 +03:00)

Commit 921d188bce: Merge branch 'develop' into nightly.spacy.io

README.md (10 lines changed):
@@ -8,12 +8,12 @@ be used in real products.
 
 spaCy comes with
 [pretrained pipelines](https://spacy.io/models) and vectors, and
-currently supports tokenization for **59+ languages**. It features
+currently supports tokenization for **60+ languages**. It features
 state-of-the-art speed, convolutional **neural network models** for tagging,
 parsing, **named entity recognition**, **text classification** and more, multi-task learning with pretrained **transformers** like BERT, as well as a production-ready training system and easy model packaging, deployment and workflow management.
 spaCy is commercial open-source software, released under the MIT license.
 
-💫 **Version 2.3 out now!**
+💫 **Version 3.0 out now!**
 [Check out the release notes here.](https://github.com/explosion/spaCy/releases)
 
 [](https://dev.azure.com/explosion-ai/public/_build?definitionId=8)
@@ -30,10 +30,11 @@ spaCy is commercial open-source software, released under the MIT license.
 ## 📖 Documentation
 
 | Documentation       |                                                                |
-| --------------- | -------------------------------------------------------------- |
+| ------------------- | -------------------------------------------------------------- |
 | [spaCy 101]         | New to spaCy? Here's everything you need to know!              |
 | [Usage Guides]      | How to use spaCy and its features.                             |
 | [New in v3.0]       | New features, backwards incompatibilities and migration guide. |
+| [Project Templates] | End-to-end workflows you can clone, modify and run.            |
 | [API Reference]     | The detailed reference for spaCy's API.                        |
 | [Models]            | Download statistical language models for spaCy.                |
 | [Universe]          | Libraries, extensions, demos, books and courses.               |
@@ -46,6 +47,7 @@ spaCy is commercial open-source software, released under the MIT license.
 [api reference]: https://spacy.io/api/
 [models]: https://spacy.io/models
 [universe]: https://spacy.io/universe
+[project templates]: https://github.com/explosion/projects
 [changelog]: https://spacy.io/usage#changelog
 [contribute]: https://github.com/explosion/spaCy/blob/master/CONTRIBUTING.md
 
@@ -69,7 +71,7 @@ it.
 
 ## Features
 
-- Support for **59+ languages**
+- Support for **60+ languages**
 - **Trained pipelines**
 - Multi-task learning with pretrained **transformers** like BERT
 - Pretrained **word vectors**

@@ -20,6 +20,7 @@ pytokenizations
 setuptools
 packaging
 importlib_metadata>=0.20; python_version < "3.8"
+typing_extensions>=3.7.4; python_version < "3.8"
 # Development dependencies
 cython>=0.25
 pytest>=4.6.5

@@ -57,6 +57,7 @@ install_requires =
     setuptools
     packaging
     importlib_metadata>=0.20; python_version < "3.8"
+    typing_extensions>=3.7.4; python_version < "3.8"
 
 [options.entry_points]
 console_scripts =

@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy-nightly"
-__version__ = "3.0.0a20"
+__version__ = "3.0.0a23"
 __release__ = True
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"

@@ -378,7 +378,7 @@ def git_sparse_checkout(repo, subpath, dest, branch):
         # Looking for this 'rev-list' command in the git --help? Hah.
         cmd = f"git -C {tmp_dir} rev-list --objects --all --missing=print -- {subpath}"
         ret = run_command(cmd, capture=True)
-        git_repo = _from_http_to_git(repo)
+        git_repo = _http_to_git(repo)
         # Now pass those missings into another bit of git internals
         missings = " ".join([x[1:] for x in ret.stdout.split() if x.startswith("?")])
         if not missings:
@@ -414,7 +414,7 @@ def get_git_version(
     return (int(version[0]), int(version[1]))
 
 
-def _from_http_to_git(repo: str) -> str:
+def _http_to_git(repo: str) -> str:
     if repo.startswith("http://"):
         repo = repo.replace(r"http://", r"https://")
     if repo.startswith(r"https://"):

@@ -9,7 +9,7 @@ import sys
 from ._util import app, Arg, Opt
 from ..training import docs_to_json
 from ..tokens import DocBin
-from ..training.converters import iob2docs, conll_ner2docs, json2docs, conllu2docs
+from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs, conllu_to_docs
 
 
 # Converters are matched by file extension except for ner/iob, which are
@@ -18,12 +18,12 @@ from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs, conllu_to_docs
 # imported from /converters.
 
 CONVERTERS = {
-    "conllubio": conllu2docs,
-    "conllu": conllu2docs,
-    "conll": conllu2docs,
-    "ner": conll_ner2docs,
-    "iob": iob2docs,
-    "json": json2docs,
+    "conllubio": conllu_to_docs,
+    "conllu": conllu_to_docs,
+    "conll": conllu_to_docs,
+    "ner": conll_ner_to_docs,
+    "iob": iob_to_docs,
+    "json": json_to_docs,
 }

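The `*2docs` converters now follow the clearer `*_to_docs` naming convention. For illustration, a converter can also be called directly from Python; a minimal sketch, assuming the nightly `conllu_to_docs` signature with an `n_sents` grouping argument:

```python
from spacy.training.converters import conllu_to_docs

# Two-token CoNLL-U fragment; columns are tab-separated.
conllu_data = (
    "1\tThey\tthey\tPRON\tPRP\t_\t2\tnsubj\t_\t_\n"
    "2\tbuy\tbuy\tVERB\tVBP\t_\t0\troot\t_\t_\n"
)
# Group one sentence per output Doc and inspect the result.
docs = list(conllu_to_docs(conllu_data, n_sents=1))
print([token.text for token in docs[0]])
```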
@@ -2,7 +2,7 @@ from typing import Optional, Dict, Any, Union, List
 from pathlib import Path
 from wasabi import msg, table
 from thinc.api import Config
-from thinc.config import VARIABLE_RE
+from thinc.config import VARIABLE_RE, ConfigValidationError
 import typer
 
 from ._util import Arg, Opt, show_validation_error, parse_config_overrides
@@ -51,7 +51,10 @@ def debug_config(
     msg.divider("Config validation")
     with show_validation_error(config_path):
         config = util.load_config(config_path, overrides=overrides)
-        nlp, _ = util.load_model_from_config(config)
+        nlp, resolved = util.load_model_from_config(config)
+        # Use the resolved config here in case user has one function returning
+        # a dict of corpora etc.
+        check_section_refs(resolved, ["training.dev_corpus", "training.train_corpus"])
     msg.good("Config is valid")
     if show_vars:
         variables = get_variables(config)
@@ -93,3 +96,23 @@ def get_variables(config: Config) -> Dict[str, Any]:
         value = util.dot_to_object(config, path)
         result[variable] = repr(value)
     return result
+
+
+def check_section_refs(config: Config, fields: List[str]) -> None:
+    """Validate fields in the config that refer to other sections or values
+    (e.g. in the corpora) and make sure that those references exist.
+    """
+    errors = []
+    for field in fields:
+        # If the field doesn't exist in the config, we ignore it
+        try:
+            value = util.dot_to_object(config, field)
+        except KeyError:
+            continue
+        try:
+            util.dot_to_object(config, value)
+        except KeyError:
+            msg = f"not a valid section reference: {value}"
+            errors.append({"loc": field.split("."), "msg": msg})
+    if errors:
+        raise ConfigValidationError(config, errors)

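The new `check_section_refs` helper validates dot-notation values such as `"corpora.dev"` that are meant to point at other config sections. A minimal sketch of what it catches, using an illustrative, deliberately incomplete config:

```python
from thinc.api import Config
from spacy import util

cfg = Config().from_str("""
[corpora]

[corpora.dev]
path = "dev.spacy"

[training]
dev_corpus = "corpora.dev"
train_corpus = "corpora.train"
""")

# "corpora.dev" resolves, but "corpora.train" is a dangling reference:
ref = util.dot_to_object(cfg, "training.train_corpus")  # -> "corpora.train"
util.dot_to_object(cfg, ref)  # raises KeyError, which debug config reports
```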
@@ -128,7 +128,7 @@ def debug_model(
     goldY = None
     for e in range(3):
         if tok2vec:
-            tok2vec.predict(X)
+            tok2vec.update([Example.from_dict(x, {}) for x in X])
         Y, get_dX = model.begin_update(X)
         if goldY is None:
             goldY = _simulate_gold(Y)

@@ -36,7 +36,7 @@ def init_config_cli(
     """
     Generate a starter config.cfg for training. Based on your requirements
     specified via the CLI arguments, this command generates a config with the
-    optimal settings for you use case. This includes the choice of architecture,
+    optimal settings for your use case. This includes the choice of architecture,
     pretrained weights and related hyperparameters.
 
     DOCS: https://nightly.spacy.io/api/cli#init-config

@@ -27,14 +27,20 @@ def project_pull_cli(
 
 
 def project_pull(project_dir: Path, remote: str, *, verbose: bool = False):
+    # TODO: We don't have tests for this :(. It would take a bit of mockery to
+    # set up. I guess see if it breaks first?
     config = load_project_config(project_dir)
     if remote in config.get("remotes", {}):
         remote = config["remotes"][remote]
     storage = RemoteStorage(project_dir, remote)
-    for cmd in config.get("commands", []):
-        deps = [project_dir / dep for dep in cmd.get("deps", [])]
-        if any(not dep.exists() for dep in deps):
-            continue
-        cmd_hash = get_command_hash("", "", deps, cmd["script"])
-        for output_path in cmd.get("outputs", []):
-            url = storage.pull(output_path, command_hash=cmd_hash)
+    commands = list(config.get("commands", []))
+    # We use a while loop here because we don't know how the commands
+    # will be ordered. A command might need dependencies from one that's later
+    # in the list.
+    while commands:
+        for i, cmd in enumerate(list(commands)):
+            deps = [project_dir / dep for dep in cmd.get("deps", [])]
+            if all(dep.exists() for dep in deps):
+                cmd_hash = get_command_hash("", "", deps, cmd["script"])
+                for output_path in cmd.get("outputs", []):
+                    url = storage.pull(output_path, command_hash=cmd_hash)
@@ -43,3 +49,10 @@ def project_pull(project_dir: Path, remote: str, *, verbose: bool = False):
                 out_locs = [project_dir / out for out in cmd.get("outputs", [])]
                 if all(loc.exists() for loc in out_locs):
                     update_lockfile(project_dir, cmd)
+                # We remove the command from the list here, and break, so that
+                # we iterate over the loop again.
+                commands.pop(i)
+                break
+        else:
+            # If we didn't break the for loop, break the while loop.
+            break

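The rewritten loop is a small dependency-aware scheduler: keep rescanning the command list, run whichever command already has all of its inputs on disk, and stop once a full pass makes no progress. The same pattern as a standalone sketch, with hypothetical helper names:

```python
from typing import Callable, Dict, List

def run_ready_commands(
    commands: List[Dict],
    dep_exists: Callable[[str], bool],
    run: Callable[[Dict], None],
) -> None:
    """Run each command once all its declared deps exist; stop when stuck."""
    commands = list(commands)
    while commands:
        for i, cmd in enumerate(commands):
            if all(dep_exists(dep) for dep in cmd.get("deps", [])):
                run(cmd)
                commands.pop(i)  # drop it and rescan from the start
                break
        else:
            # A full pass ran nothing: remaining deps can't be satisfied.
            break
```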
@@ -59,7 +59,8 @@ factory = "parser"
 
 [components.parser.model]
 @architectures = "spacy.TransitionBasedParser.v1"
-nr_feature_tokens = 8
+state_type = "parser"
+extra_state_tokens = false
 hidden_width = 128
 maxout_pieces = 3
 use_upper = false
@@ -79,7 +80,8 @@ factory = "ner"
 
 [components.ner.model]
 @architectures = "spacy.TransitionBasedParser.v1"
-nr_feature_tokens = 3
+state_type = "ner"
+extra_state_tokens = false
 hidden_width = 64
 maxout_pieces = 2
 use_upper = false
@@ -93,6 +95,49 @@ grad_factor = 1.0
 @layers = "reduce_mean.v1"
 {% endif -%}
 
+{% if "entity_linker" in components -%}
+[components.entity_linker]
+factory = "entity_linker"
+get_candidates = {"@misc":"spacy.CandidateGenerator.v1"}
+incl_context = true
+incl_prior = true
+
+[components.entity_linker.model]
+@architectures = "spacy.EntityLinker.v1"
+nO = null
+
+[components.entity_linker.model.tok2vec]
+@architectures = "spacy-transformers.TransformerListener.v1"
+grad_factor = 1.0
+
+[components.entity_linker.model.tok2vec.pooling]
+@layers = "reduce_mean.v1"
+{% endif -%}
+
+{% if "textcat" in components %}
+[components.textcat]
+factory = "textcat"
+
+{% if optimize == "accuracy" %}
+[components.textcat.model]
+@architectures = "spacy.TextCatEnsemble.v1"
+exclusive_classes = false
+width = 64
+conv_depth = 2
+embed_size = 2000
+window_size = 1
+ngram_size = 1
+nO = null
+
+{% else -%}
+[components.textcat.model]
+@architectures = "spacy.TextCatBOW.v1"
+exclusive_classes = false
+ngram_size = 1
+no_output_layer = false
+{%- endif %}
+{%- endif %}
+
 {# NON-TRANSFORMER PIPELINE #}
 {% else -%}
 
@@ -140,7 +185,8 @@ factory = "parser"
 
 [components.parser.model]
 @architectures = "spacy.TransitionBasedParser.v1"
-nr_feature_tokens = 8
+state_type = "parser"
+extra_state_tokens = false
 hidden_width = 128
 maxout_pieces = 3
 use_upper = true
@@ -157,7 +203,8 @@ factory = "ner"
 
 [components.ner.model]
 @architectures = "spacy.TransitionBasedParser.v1"
-nr_feature_tokens = 6
+state_type = "ner"
+extra_state_tokens = false
 hidden_width = 64
 maxout_pieces = 2
 use_upper = true
@@ -167,10 +214,50 @@ nO = null
 @architectures = "spacy.Tok2VecListener.v1"
 width = ${components.tok2vec.model.encode.width}
 {% endif %}
+
+{% if "entity_linker" in components -%}
+[components.entity_linker]
+factory = "entity_linker"
+get_candidates = {"@misc":"spacy.CandidateGenerator.v1"}
+incl_context = true
+incl_prior = true
+
+[components.entity_linker.model]
+@architectures = "spacy.EntityLinker.v1"
+nO = null
+
+[components.entity_linker.model.tok2vec]
+@architectures = "spacy.Tok2VecListener.v1"
+width = ${components.tok2vec.model.encode.width}
+{% endif %}
+
+{% if "textcat" in components %}
+[components.textcat]
+factory = "textcat"
+
+{% if optimize == "accuracy" %}
+[components.textcat.model]
+@architectures = "spacy.TextCatEnsemble.v1"
+exclusive_classes = false
+width = 64
+conv_depth = 2
+embed_size = 2000
+window_size = 1
+ngram_size = 1
+nO = null
+
+{% else -%}
+[components.textcat.model]
+@architectures = "spacy.TextCatBOW.v1"
+exclusive_classes = false
+ngram_size = 1
+no_output_layer = false
+{%- endif %}
+{%- endif %}
 {% endif %}
 
 {% for pipe in components %}
-{% if pipe not in ["tagger", "parser", "ner"] %}
+{% if pipe not in ["tagger", "parser", "ner", "textcat", "entity_linker"] %}
 {# Other components defined by the user: we just assume they're factories #}
 [components.{{ pipe }}]
 factory = "{{ pipe }}"
@@ -197,7 +284,7 @@ vectors = "{{ word_vectors }}"
 {% endif -%}
 {% if use_transformer -%}
 accumulate_gradient = {{ transformer["size_factor"] }}
-{% endif %}
+{% endif -%}
 dev_corpus = "corpora.dev"
 train_corpus = "corpora.train"
 
@@ -230,18 +317,3 @@ start = 100
 stop = 1000
 compound = 1.001
 {% endif %}
-
-[training.score_weights]
-{%- if "tagger" in components %}
-tag_acc = {{ (1.0 / components|length)|round(2) }}
-{%- endif -%}
-{%- if "parser" in components %}
-dep_uas = 0.0
-dep_las = {{ (1.0 / components|length)|round(2) }}
-sents_f = 0.0
-{%- endif %}
-{%- if "ner" in components %}
-ents_f = {{ (1.0 / components|length)|round(2) }}
-ents_p = 0.0
-ents_r = 0.0
-{%- endif -%}

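The hard-coded `[training.score_weights]` block is gone from the template: weights now come from each component's `default_score_weights` and are combined and normalized across the pipeline at runtime. A simplified sketch of that normalization (not the actual `spacy.util.combine_score_weights`, whose behavior may differ in detail):

```python
from typing import Dict, List, Optional

def combine(weights_per_component: List[Dict[str, Optional[float]]]) -> Dict[str, Optional[float]]:
    """Rescale per-component weights so they sum to 1.0 across the pipeline.
    None-valued weights are passed through untouched (reported but unweighted)."""
    combined = {}
    n = len(weights_per_component)
    for weights in weights_per_component:
        total = sum(v for v in weights.values() if v is not None) or 1.0
        for key, value in weights.items():
            combined[key] = None if value is None else round(value / total / n, 2)
    return combined

print(combine([{"tag_acc": 1.0}, {"dep_uas": 0.5, "dep_las": 0.5, "sents_f": 0.0}]))
# {'tag_acc': 0.5, 'dep_uas': 0.25, 'dep_las': 0.25, 'sents_f': 0.0}
```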
@@ -152,6 +152,7 @@ def train(
         exclude=frozen_components,
     )
     msg.info(f"Training. Initial learn rate: {optimizer.learn_rate}")
-    print_row, finalize_logger = train_logger(nlp)
+    with nlp.select_pipes(disable=frozen_components):
+        print_row, finalize_logger = train_logger(nlp)
 
     try:
@@ -163,6 +164,7 @@ def train(
                 progress.close()
                 print_row(info)
                 if is_best_checkpoint and output_path is not None:
-                    update_meta(T_cfg, nlp, info)
+                    with nlp.select_pipes(disable=frozen_components):
+                        update_meta(T_cfg, nlp, info)
                     with nlp.use_params(optimizer.averages):
                         nlp.to_disk(output_path / "model-best")
@@ -207,10 +209,17 @@ def create_train_batches(iterator, batcher, max_epochs: int):
 def create_evaluation_callback(
     nlp: Language, dev_corpus: Callable, weights: Dict[str, float]
 ) -> Callable[[], Tuple[float, Dict[str, float]]]:
+    weights = {key: value for key, value in weights.items() if value is not None}
+
     def evaluate() -> Tuple[float, Dict[str, float]]:
         dev_examples = list(dev_corpus(nlp))
         scores = nlp.evaluate(dev_examples)
-        # Calculate a weighted sum based on score_weights for the main score
+        # Calculate a weighted sum based on score_weights for the main score.
+        # We can only consider scores that are ints/floats, not dicts like
+        # entity scores per type etc.
+        for key, value in scores.items():
+            if key in weights and not isinstance(value, (int, float)):
+                raise ValueError(Errors.E915.format(name=key, score_type=type(value)))
         try:
             weighted_score = sum(
                 scores.get(s, 0.0) * weights.get(s, 0.0) for s in weights
@@ -366,6 +375,7 @@ def update_meta(
 ) -> None:
     nlp.meta["performance"] = {}
     for metric in training["score_weights"]:
-        nlp.meta["performance"][metric] = info["other_scores"].get(metric, 0.0)
+        if metric is not None:
+            nlp.meta["performance"][metric] = info["other_scores"].get(metric, 0.0)
     for pipe_name in nlp.pipe_names:
         nlp.meta["performance"][f"{pipe_name}_loss"] = info["losses"][pipe_name]

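With `None` weights filtered out up front, the final score is a plain weighted sum over the remaining flat scores. A worked example with made-up numbers:

```python
weights = {"dep_uas": 0.5, "dep_las": 0.5, "sents_f": 0.0, "dep_las_per_type": None}
# Drop None weights first, exactly as create_evaluation_callback now does:
weights = {k: v for k, v in weights.items() if v is not None}
scores = {"dep_uas": 0.90, "dep_las": 0.86, "sents_f": 0.99}
weighted_score = sum(scores.get(s, 0.0) * weights.get(s, 0.0) for s in weights)
print(round(weighted_score, 2))  # 0.88 = 0.90 * 0.5 + 0.86 * 0.5 + 0.99 * 0.0
```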
@@ -22,6 +22,11 @@ try:
 except ImportError:
     cupy = None
 
+try:  # Python 3.8+
+    from typing import Literal
+except ImportError:
+    from typing_extensions import Literal  # noqa: F401
+
 from thinc.api import Optimizer  # noqa: F401
 
 pickle = pickle

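`typing.Literal` only exists on Python 3.8 and later, which is why `typing_extensions` is added as a 3.7-only requirement earlier in this commit. Other modules can then import the alias from one place, roughly like this (a simplified sketch):

```python
from spacy.compat import Literal

# Literal narrows an argument to a fixed set of values, as used later in
# this diff for the parser's new state_type setting:
def build_parser_model(state_type: Literal["parser", "ner"]) -> None:
    ...
```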
@@ -69,7 +69,7 @@ class Warnings:
             "in problems with the vocab further on in the pipeline.")
     W030 = ("Some entities could not be aligned in the text \"{text}\" with "
             "entities \"{entities}\". Use "
-            "`spacy.training.biluo_tags_from_offsets(nlp.make_doc(text), entities)`"
+            "`spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)`"
             " to check the alignment. Misaligned entities ('-') will be "
             "ignored during training.")
     W033 = ("Training a new {model} using a model with no lexeme normalization "
@@ -480,6 +480,13 @@ class Errors:
     E201 = ("Span index out of range.")
 
     # TODO: fix numbering after merging develop into master
+    E915 = ("Can't use score '{name}' to calculate final weighted score. Expected "
+            "float or int but got: {score_type}. To exclude the score from the "
+            "final score, set its weight to null in the [training.score_weights] "
+            "section of your training config.")
+    E916 = ("Can't log score for '{name}' in table: not a valid score ({score_type})")
+    E917 = ("Received invalid value {value} for 'state_type' in "
+            "TransitionBasedParser: only 'parser' or 'ner' are valid options.")
     E918 = ("Received invalid value for vocab: {vocab} ({vocab_type}). Valid "
             "values are an instance of spacy.vocab.Vocab or True to create one"
             " (default).")

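E915 points at the escape hatch for non-numeric scores: give them a `null` weight so they're reported but never folded into the main score. The Python-side equivalent, mirroring the NER defaults introduced later in this diff:

```python
# Same as setting `ents_per_type = null` in [training.score_weights]:
score_weights = {"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None}
```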
@@ -140,7 +140,6 @@ cdef class KnowledgeBase:
         self._entries.push_back(entry)
         self._aliases_table.push_back(alias)
 
-    cpdef from_disk(self, loc)
     cpdef set_entities(self, entity_list, freq_list, vector_list)

spacy/kb.pyx (47 lines changed):
@@ -9,7 +9,8 @@ from libcpp.vector cimport vector
 
 from pathlib import Path
 import warnings
-from os import path
+
+from spacy import util
 
 from .typedefs cimport hash_t
 from .errors import Errors, Warnings
@@ -319,8 +320,14 @@ cdef class KnowledgeBase:
         return 0.0
 
 
-    def to_disk(self, loc):
-        cdef Writer writer = Writer(loc)
+    def to_disk(self, path):
+        path = util.ensure_path(path)
+        if path.is_dir():
+            raise ValueError(Errors.E928.format(loc=path))
+        if not path.parent.exists():
+            path.parent.mkdir(parents=True)
+
+        cdef Writer writer = Writer(path)
         writer.write_header(self.get_size_entities(), self.entity_vector_length)
 
         # dumping the entity vectors in their original order
@@ -359,7 +366,13 @@ cdef class KnowledgeBase:
 
         writer.close()
 
-    cpdef from_disk(self, loc):
+    def from_disk(self, path):
+        path = util.ensure_path(path)
+        if path.is_dir():
+            raise ValueError(Errors.E928.format(loc=path))
+        if not path.exists():
+            raise ValueError(Errors.E929.format(loc=path))
+
         cdef hash_t entity_hash
         cdef hash_t alias_hash
         cdef int64_t entry_index
@@ -369,7 +382,7 @@ cdef class KnowledgeBase:
         cdef AliasC alias
         cdef float vector_element
 
-        cdef Reader reader = Reader(loc)
+        cdef Reader reader = Reader(path)
 
         # STEP 0: load header and initialize KB
         cdef int64_t nr_entities
@@ -450,16 +463,13 @@ cdef class KnowledgeBase:
 
 
 cdef class Writer:
-    def __init__(self, object loc):
-        if isinstance(loc, Path):
-            loc = bytes(loc)
-        if path.exists(loc):
-            if path.isdir(loc):
-                raise ValueError(Errors.E928.format(loc=loc))
-        cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
+    def __init__(self, path):
+        assert isinstance(path, Path)
+        content = bytes(path)
+        cdef bytes bytes_loc = content.encode('utf8') if type(content) == unicode else content
         self._fp = fopen(<char*>bytes_loc, 'wb')
         if not self._fp:
-            raise IOError(Errors.E146.format(path=loc))
+            raise IOError(Errors.E146.format(path=path))
         fseek(self._fp, 0, 0)
 
     def close(self):
@@ -496,14 +506,9 @@ cdef class Writer:
 
 
 cdef class Reader:
-    def __init__(self, object loc):
-        if isinstance(loc, Path):
-            loc = bytes(loc)
-        if not path.exists(loc):
-            raise ValueError(Errors.E929.format(loc=loc))
-        if path.isdir(loc):
-            raise ValueError(Errors.E928.format(loc=loc))
-        cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
+    def __init__(self, path):
+        content = bytes(path)
+        cdef bytes bytes_loc = content.encode('utf8') if type(content) == unicode else content
         self._fp = fopen(<char*>bytes_loc, 'rb')
         if not self._fp:
             PyErr_SetFromErrno(IOError)

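Path validation now happens once in `to_disk`/`from_disk` instead of inside `Writer`/`Reader`, and missing parent directories are created on the fly. A usage sketch of the round trip; the constructor and `add_entity` signatures are assumptions based on the nightly API:

```python
from pathlib import Path
from spacy.kb import KnowledgeBase
from spacy.vocab import Vocab

vocab = Vocab()
kb = KnowledgeBase(vocab, entity_vector_length=3)
kb.add_entity(entity="Q42", freq=12, entity_vector=[1.0, 2.0, 3.0])

# Parent directory "output" is created on demand; a directory path raises E928.
kb.to_disk(Path("output") / "kb")

kb2 = KnowledgeBase(vocab, entity_vector_length=3)
kb2.from_disk(Path("output") / "kb")  # a missing file would raise E929
```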
@@ -25,7 +25,6 @@ class Bengali(Language):
     "lemmatizer",
     assigns=["token.lemma"],
     default_config={"model": None, "mode": "rule", "lookups": None},
-    scores=["lemma_acc"],
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(

@@ -30,7 +30,6 @@ class Greek(Language):
     "lemmatizer",
     assigns=["token.lemma"],
     default_config={"model": None, "mode": "rule", "lookups": None},
-    scores=["lemma_acc"],
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(

@@ -29,7 +29,6 @@ class English(Language):
     "lemmatizer",
     assigns=["token.lemma"],
     default_config={"model": None, "mode": "rule", "lookups": None},
-    scores=["lemma_acc"],
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(

@@ -28,7 +28,6 @@ class Persian(Language):
     "lemmatizer",
     assigns=["token.lemma"],
     default_config={"model": None, "mode": "rule", "lookups": None},
-    scores=["lemma_acc"],
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(

@@ -33,7 +33,6 @@ class French(Language):
     "lemmatizer",
     assigns=["token.lemma"],
     default_config={"model": None, "mode": "rule", "lookups": None},
-    scores=["lemma_acc"],
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(

@@ -28,7 +28,6 @@ class Norwegian(Language):
     "lemmatizer",
     assigns=["token.lemma"],
     default_config={"model": None, "mode": "rule", "lookups": None},
-    scores=["lemma_acc"],
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(

@@ -30,7 +30,6 @@ class Dutch(Language):
     "lemmatizer",
     assigns=["token.lemma"],
     default_config={"model": None, "mode": "rule", "lookups": None},
-    scores=["lemma_acc"],
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(

@@ -35,7 +35,6 @@ class Polish(Language):
     "lemmatizer",
     assigns=["token.lemma"],
     default_config={"model": None, "mode": "pos_lookup", "lookups": None},
-    scores=["lemma_acc"],
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(

@@ -25,7 +25,6 @@ class Russian(Language):
     "lemmatizer",
     assigns=["token.lemma"],
     default_config={"model": None, "mode": "pymorphy2", "lookups": None},
-    scores=["lemma_acc"],
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(

@@ -31,7 +31,6 @@ class Swedish(Language):
     "lemmatizer",
     assigns=["token.lemma"],
     default_config={"model": None, "mode": "rule", "lookups": None},
-    scores=["lemma_acc"],
    default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(

@@ -25,7 +25,6 @@ class Ukrainian(Language):
     "lemmatizer",
     assigns=["token.lemma"],
     default_config={"model": None, "mode": "pymorphy2", "lookups": None},
-    scores=["lemma_acc"],
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(

@@ -248,8 +248,11 @@ class Language:
         self._config["nlp"]["pipeline"] = list(self.component_names)
         self._config["nlp"]["disabled"] = list(self.disabled)
         self._config["components"] = pipeline
-        if not self._config["training"].get("score_weights"):
-            combined_score_weights = combine_score_weights(score_weights)
+        # We're merging the existing score weights back into the combined
+        # weights to make sure we're preserving custom settings in the config
+        # but also reflect updates (e.g. new components added)
+        prev_weights = self._config["training"].get("score_weights", {})
+        combined_score_weights = combine_score_weights(score_weights, prev_weights)
         self._config["training"]["score_weights"] = combined_score_weights
         if not srsly.is_json_serializable(self._config):
             raise ValueError(Errors.E961.format(config=self._config))
@@ -412,7 +415,6 @@ class Language:
         assigns: Iterable[str] = SimpleFrozenList(),
         requires: Iterable[str] = SimpleFrozenList(),
         retokenizes: bool = False,
-        scores: Iterable[str] = SimpleFrozenList(),
         default_score_weights: Dict[str, float] = SimpleFrozenDict(),
         func: Optional[Callable] = None,
     ) -> Callable:
@@ -430,12 +432,11 @@ class Language:
             e.g. "token.ent_id". Used for pipeline analysis.
         retokenizes (bool): Whether the component changes the tokenization.
             Used for pipeline analysis.
-        scores (Iterable[str]): All scores set by the component if it's trainable,
-            e.g. ["ents_f", "ents_r", "ents_p"].
         default_score_weights (Dict[str, float]): The scores to report during
             training, and their default weight towards the final score used to
             select the best model. Weights should sum to 1.0 per component and
-            will be combined and normalized for the whole pipeline.
+            will be combined and normalized for the whole pipeline. If None,
+            the score won't be shown in the logs or be weighted.
         func (Optional[Callable]): Factory function if not used as a decorator.
 
         DOCS: https://nightly.spacy.io/api/language#factory
@@ -475,7 +476,7 @@ class Language:
                 default_config=default_config,
                 assigns=validate_attrs(assigns),
                 requires=validate_attrs(requires),
-                scores=scores,
+                scores=list(default_score_weights.keys()),
                 default_score_weights=default_score_weights,
                 retokenizes=retokenizes,
             )

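For component authors, the upshot is that `scores` no longer needs to be declared separately: score names are read off `default_score_weights`, and a `None` weight keeps a score in the output but out of the logs and the weighted total. A sketch of a custom factory under the new API, with a hypothetical component name:

```python
from spacy.language import Language

@Language.factory(
    "my_scorer",
    default_score_weights={"my_acc": 1.0, "my_acc_per_type": None},
)
def make_my_scorer(nlp: Language, name: str):
    # A do-nothing component; real factories return a trainable pipe here.
    def component(doc):
        return doc
    return component
```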
@@ -2,6 +2,8 @@ from typing import Optional, List
 from thinc.api import Model, chain, list2array, Linear, zero_init, use_ops
 from thinc.types import Floats2d
 
+from ...errors import Errors
+from ...compat import Literal
 from ...util import registry
 from .._precomputable_affine import PrecomputableAffine
 from ..tb_framework import TransitionModel
@@ -11,7 +13,8 @@ from ...tokens import Doc
 @registry.architectures.register("spacy.TransitionBasedParser.v1")
 def build_tb_parser_model(
     tok2vec: Model[List[Doc], List[Floats2d]],
-    nr_feature_tokens: int,
+    state_type: Literal["parser", "ner"],
+    extra_state_tokens: bool,
     hidden_width: int,
     maxout_pieces: int,
     use_upper: bool = True,
@@ -40,20 +43,12 @@ def build_tb_parser_model(
 
     tok2vec (Model[List[Doc], List[Floats2d]]):
         Subnetwork to map tokens into vector representations.
-    nr_feature_tokens (int): The number of tokens in the context to use to
-        construct the state vector. Valid choices are 1, 2, 3, 6, 8 and 13. The
-        2, 8 and 13 feature sets are designed for the parser, while the 3 and 6
-        feature sets are designed for the NER. The recommended feature sets are
-        3 for NER, and 8 for the dependency parser.
-
-        TODO: This feature should be split into two, state_type: ["deps", "ner"]
-        and extra_state_features: [True, False]. This would map into:
-
-        (deps, False): 8
-        (deps, True): 13
-        (ner, False): 3
-        (ner, True): 6
-
+    state_type (str):
+        String value denoting the type of parser model: "parser" or "ner"
+    extra_state_tokens (bool): Whether or not to use additional tokens in the context
+        to construct the state vector. Defaults to `False`, which means 3 and 8
+        for the NER and parser respectively. When set to `True`, this would become 6
+        feature sets (for the NER) or 13 (for the parser).
     hidden_width (int): The width of the hidden layer.
     maxout_pieces (int): How many pieces to use in the state prediction layer.
         Recommended values are 1, 2 or 3. If 1, the maxout non-linearity
@@ -68,8 +63,14 @@ def build_tb_parser_model(
         Usually inferred from data at the beginning of training, or loaded from
         disk.
     """
+    if state_type == "parser":
+        nr_feature_tokens = 13 if extra_state_tokens else 8
+    elif state_type == "ner":
+        nr_feature_tokens = 6 if extra_state_tokens else 3
+    else:
+        raise ValueError(Errors.E917.format(value=state_type))
     t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
-    tok2vec = chain(tok2vec, list2array(), Linear(hidden_width, t2v_width),)
+    tok2vec = chain(tok2vec, list2array(), Linear(hidden_width, t2v_width))
     tok2vec.set_dim("nO", hidden_width)
     lower = PrecomputableAffine(
         nO=hidden_width if use_upper else nO,

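For reference, the old `nr_feature_tokens` magic numbers map onto the two new settings exactly as the branch above computes them:

```python
# (state_type, extra_state_tokens) -> number of feature tokens
NR_FEATURE_TOKENS = {
    ("parser", False): 8,
    ("parser", True): 13,
    ("ner", False): 3,
    ("ner", True): 6,
}
```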
@@ -15,7 +15,8 @@ from ..training import validate_examples
 default_model_config = """
 [model]
 @architectures = "spacy.TransitionBasedParser.v1"
-nr_feature_tokens = 8
+state_type = "parser"
+extra_state_tokens = false
 hidden_width = 64
 maxout_pieces = 2
 
@@ -42,8 +43,14 @@ DEFAULT_PARSER_MODEL = Config().from_str(default_model_config)["model"]
         "min_action_freq": 30,
         "model": DEFAULT_PARSER_MODEL,
     },
-    scores=["dep_uas", "dep_las", "dep_las_per_type", "sents_p", "sents_r", "sents_f"],
-    default_score_weights={"dep_uas": 0.5, "dep_las": 0.5, "sents_f": 0.0},
+    default_score_weights={
+        "dep_uas": 0.5,
+        "dep_las": 0.5,
+        "dep_las_per_type": None,
+        "sents_p": None,
+        "sents_r": None,
+        "sents_f": 0.0,
+    },
 )
 def make_parser(
     nlp: Language,

@@ -25,8 +25,12 @@ PatternType = Dict[str, Union[str, List[Dict[str, Any]]]]
         "overwrite_ents": False,
         "ent_id_sep": DEFAULT_ENT_ID_SEP,
     },
-    scores=["ents_p", "ents_r", "ents_f", "ents_per_type"],
-    default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0},
+    default_score_weights={
+        "ents_f": 1.0,
+        "ents_p": 0.0,
+        "ents_r": 0.0,
+        "ents_per_type": None,
+    },
 )
 def make_entity_ruler(
     nlp: Language,

@@ -21,7 +21,6 @@ from .. import util
         "lookups": None,
         "overwrite": False,
     },
-    scores=["lemma_acc"],
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(

@@ -49,8 +49,7 @@ DEFAULT_MORPH_MODEL = Config().from_str(default_model_config)["model"]
     "morphologizer",
     assigns=["token.morph", "token.pos"],
     default_config={"model": DEFAULT_MORPH_MODEL},
-    scores=["pos_acc", "morph_acc", "morph_per_feat"],
-    default_score_weights={"pos_acc": 0.5, "morph_acc": 0.5},
+    default_score_weights={"pos_acc": 0.5, "morph_acc": 0.5, "morph_per_feat": None},
 )
 def make_morphologizer(
     nlp: Language,

@@ -13,7 +13,8 @@ from ..training import validate_examples
 default_model_config = """
 [model]
 @architectures = "spacy.TransitionBasedParser.v1"
-nr_feature_tokens = 6
+state_type = "ner"
+extra_state_tokens = false
 hidden_width = 64
 maxout_pieces = 2
 
@@ -38,8 +39,7 @@ DEFAULT_NER_MODEL = Config().from_str(default_model_config)["model"]
         "update_with_oracle_cut_size": 100,
         "model": DEFAULT_NER_MODEL,
     },
-    scores=["ents_p", "ents_r", "ents_f", "ents_per_type"],
-    default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0},
+    default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None},
 
 )
 def make_ner(

@@ -15,7 +15,6 @@ from .. import util
     "sentencizer",
    assigns=["token.is_sent_start", "doc.sents"],
     default_config={"punct_chars": None},
-    scores=["sents_p", "sents_r", "sents_f"],
     default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0},
 )
 def make_sentencizer(

@@ -36,7 +36,6 @@ DEFAULT_SENTER_MODEL = Config().from_str(default_model_config)["model"]
     "senter",
     assigns=["token.is_sent_start"],
     default_config={"model": DEFAULT_SENTER_MODEL},
-    scores=["sents_p", "sents_r", "sents_f"],
     default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0},
 )
 def make_senter(nlp: Language, name: str, model: Model):

@@ -42,7 +42,6 @@ DEFAULT_TAGGER_MODEL = Config().from_str(default_model_config)["model"]
     "tagger",
     assigns=["token.tag"],
     default_config={"model": DEFAULT_TAGGER_MODEL},
-    scores=["tag_acc"],
     default_score_weights={"tag_acc": 1.0},
 )
 def make_tagger(nlp: Language, name: str, model: Model):

@@ -62,18 +62,17 @@ subword_features = true
         "positive_label": None,
         "model": DEFAULT_TEXTCAT_MODEL,
     },
-    scores=[
-        "cats_score",
-        "cats_score_desc",
-        "cats_p",
-        "cats_r",
-        "cats_f",
-        "cats_macro_f",
-        "cats_macro_auc",
-        "cats_f_per_type",
-        "cats_macro_auc_per_type",
-    ],
-    default_score_weights={"cats_score": 1.0},
+    default_score_weights={
+        "cats_score": 1.0,
+        "cats_score_desc": None,
+        "cats_p": None,
+        "cats_r": None,
+        "cats_f": None,
+        "cats_macro_f": None,
+        "cats_macro_auc": None,
+        "cats_f_per_type": None,
+        "cats_macro_auc_per_type": None,
+    },
 )
 def make_textcat(
     nlp: Language,

@@ -127,7 +127,7 @@ class Tok2Vec(Pipe):
         tokvecs = self.model.predict(docs)
         batch_id = Tok2VecListener.get_batch_id(docs)
         for listener in self.listeners:
-            listener.receive(batch_id, tokvecs, None)
+            listener.receive(batch_id, tokvecs, lambda dX: [])
         return tokvecs

     def set_annotations(self, docs: Sequence[Doc], tokvecses) -> None:

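The `lambda dX: []` change matters at predict time: listeners used to receive `None` as their backprop callback, which raised a `TypeError` if a downstream component ever invoked it. A reduced sketch of the contract, with illustrative names rather than the real `Tok2VecListener` internals:

```python
def receive(batch_id, outputs, backprop):
    # Downstream components may call backprop unconditionally, so it must
    # be callable even when no gradient can flow (i.e. during predict).
    return backprop(outputs)

receive(0, [], lambda dX: [])  # predict time: safely returns an empty gradient
```

The new `test_tok2vec_listener_callback` further down asserts exactly this: the backprop chain never hits a `None` callback.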
@@ -211,7 +211,7 @@ class ConfigSchemaTraining(BaseModel):
     seed: Optional[StrictInt] = Field(..., title="Random seed")
     gpu_allocator: Optional[StrictStr] = Field(..., title="Memory allocator when running on GPU")
     accumulate_gradient: StrictInt = Field(..., title="Whether to divide the batch up into substeps")
-    score_weights: Dict[StrictStr, Union[StrictFloat, StrictInt]] = Field(..., title="Scores to report and their weights for selecting final model")
+    score_weights: Dict[StrictStr, Optional[Union[StrictFloat, StrictInt]]] = Field(..., title="Scores to report and their weights for selecting final model")
     init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights")
     raw_text: Optional[StrictStr] = Field(default=None, title="Raw text")
     optimizer: Optimizer = Field(..., title="The optimizer to use")

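Widening the value type to `Optional[...]` is what lets `null` weights pass config validation. A reduced pydantic sketch of just this field (the model name is a stand-in, not spaCy's schema):

```python
from typing import Dict, Optional, Union
from pydantic import BaseModel, StrictFloat, StrictInt, StrictStr

class TrainingSketch(BaseModel):  # stand-in for ConfigSchemaTraining
    score_weights: Dict[StrictStr, Optional[Union[StrictFloat, StrictInt]]]

# A None weight (null in the config file) now validates:
TrainingSketch(score_weights={"tag_acc": 1.0, "morph_per_feat": None})
```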
@@ -240,7 +240,7 @@ class Scorer:
                             pred_per_feat[field].add((gold_i, feat))
             for field in per_feat:
                 per_feat[field].score_set(
-                    pred_per_feat.get(field, set()), gold_per_feat.get(field, set()),
+                    pred_per_feat.get(field, set()), gold_per_feat.get(field, set())
                 )
         result = {k: v.to_dict() for k, v in per_feat.items()}
         return {f"{attr}_per_feat": result}
@@ -418,9 +418,9 @@ class Scorer:
                     f_per_type[pred_label].fp += 1
         micro_prf = PRFScore()
         for label_prf in f_per_type.values():
-            micro_prf.tp = label_prf.tp
-            micro_prf.fn = label_prf.fn
-            micro_prf.fp = label_prf.fp
+            micro_prf.tp += label_prf.tp
+            micro_prf.fn += label_prf.fn
+            micro_prf.fp += label_prf.fp
         n_cats = len(f_per_type) + 1e-100
         macro_p = sum(prf.precision for prf in f_per_type.values()) / n_cats
         macro_r = sum(prf.recall for prf in f_per_type.values()) / n_cats

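The `=` to `+=` change is a genuine bug fix: micro-averaging pools true/false positives and false negatives over all labels before computing precision and recall, but plain assignment kept only the last label's counts. Toy numbers, not from the repo:

```python
per_label = {"A": dict(tp=2, fp=1, fn=0), "B": dict(tp=1, fp=0, fn=3)}
tp = fp = fn = 0
for counts in per_label.values():  # accumulate, don't overwrite
    tp += counts["tp"]
    fp += counts["fp"]
    fn += counts["fn"]
micro_p = tp / (tp + fp)  # 3/4, pooled over both labels
micro_r = tp / (tp + fn)  # 3/6; assignment would have kept only B's counts (p=1/1, r=1/4)
```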
@@ -144,6 +144,29 @@ def test_kb_empty(nlp):
         entity_linker.begin_training(lambda: [])


+def test_kb_serialize(nlp):
+    """Test serialization of the KB"""
+    mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
+    with make_tempdir() as d:
+        # normal read-write behaviour
+        mykb.to_disk(d / "kb")
+        mykb.from_disk(d / "kb")
+        mykb.to_disk(d / "kb.file")
+        mykb.from_disk(d / "kb.file")
+        mykb.to_disk(d / "new" / "kb")
+        mykb.from_disk(d / "new" / "kb")
+        # allow overwriting an existing file
+        mykb.to_disk(d / "kb.file")
+        with pytest.raises(ValueError):
+            # can not write to a directory
+            mykb.to_disk(d)
+        with pytest.raises(ValueError):
+            # can not read from a directory
+            mykb.from_disk(d)
+        with pytest.raises(ValueError):
+            # can not read from an unknown file
+            mykb.from_disk(d / "unknown" / "kb")
+
 def test_candidate_generation(nlp):
     """Test correct candidate generation"""
     mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)

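For context, a sketch of the round trip the new test covers, with a little content added so the reload is observable (entity and alias values are invented for illustration):

```python
from spacy.kb import KnowledgeBase
from spacy.lang.en import English

nlp = English()
kb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
kb.add_entity(entity="Q1", freq=10, entity_vector=[1.0])
kb.add_alias(alias="one", entities=["Q1"], probabilities=[1.0])
kb.to_disk("kb")  # the path is a single file, created or overwritten

kb2 = KnowledgeBase(nlp.vocab, entity_vector_length=1)
kb2.from_disk("kb")
assert "Q1" in kb2.get_entity_strings()
```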
@@ -359,12 +359,8 @@ def test_language_factories_scores():
     func = lambda nlp, name: lambda doc: doc
     weights1 = {"a1": 0.5, "a2": 0.5}
     weights2 = {"b1": 0.2, "b2": 0.7, "b3": 0.1}
-    Language.factory(
-        f"{name}1", scores=list(weights1), default_score_weights=weights1, func=func,
-    )
-    Language.factory(
-        f"{name}2", scores=list(weights2), default_score_weights=weights2, func=func,
-    )
+    Language.factory(f"{name}1", default_score_weights=weights1, func=func)
+    Language.factory(f"{name}2", default_score_weights=weights2, func=func)
     meta1 = Language.get_factory_meta(f"{name}1")
     assert meta1.default_score_weights == weights1
     meta2 = Language.get_factory_meta(f"{name}2")
@@ -376,6 +372,21 @@ def test_language_factories_scores():
     cfg = nlp.config["training"]
     expected_weights = {"a1": 0.25, "a2": 0.25, "b1": 0.1, "b2": 0.35, "b3": 0.05}
     assert cfg["score_weights"] == expected_weights
+    # Test with custom defaults
+    config = nlp.config.copy()
+    config["training"]["score_weights"]["a1"] = 0.0
+    config["training"]["score_weights"]["b3"] = 1.0
+    nlp = English.from_config(config)
+    score_weights = nlp.config["training"]["score_weights"]
+    expected = {"a1": 0.0, "a2": 0.5, "b1": 0.03, "b2": 0.12, "b3": 0.34}
+    assert score_weights == expected
+    # Test with null values
+    config = nlp.config.copy()
+    config["training"]["score_weights"]["a1"] = None
+    nlp = English.from_config(config)
+    score_weights = nlp.config["training"]["score_weights"]
+    expected = {"a1": None, "a2": 0.5, "b1": 0.03, "b2": 0.12, "b3": 0.35}
+    assert score_weights == expected


 def test_pipe_factories_from_source():

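The first `expected_weights` follows from scaling: the two components' default weights sum to 2.0, and each entry is divided by that total so the combined weights sum to 1.0. A sketch of that arithmetic, checking the asserted values above (the observed behaviour of the defaults case, not spaCy's actual implementation, which also handles user overrides and null values):

```python
w = {"a1": 0.5, "a2": 0.5, "b1": 0.2, "b2": 0.7, "b3": 0.1}
total = sum(w.values())  # 2.0
combined = {k: v / total for k, v in w.items()}
assert combined == {"a1": 0.25, "a2": 0.25, "b1": 0.1, "b2": 0.35, "b3": 0.05}
```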
@@ -8,6 +8,7 @@ from spacy.language import Language
 from spacy.pipeline import TextCategorizer
 from spacy.tokens import Doc
 from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
+from spacy.scorer import Scorer

 from ..util import make_tempdir
 from ...cli.train import verify_textcat_config
@@ -224,3 +225,31 @@ def test_positive_class_not_binary():
     assert textcat.labels == ("SOME", "THING", "POS")
     with pytest.raises(ValueError):
         verify_textcat_config(nlp, pipe_config)
+
+
+def test_textcat_evaluation():
+    train_examples = []
+    nlp = English()
+    ref1 = nlp("one")
+    ref1.cats = {"winter": 1.0, "summer": 1.0, "spring": 1.0, "autumn": 1.0}
+    pred1 = nlp("one")
+    pred1.cats = {"winter": 1.0, "summer": 0.0, "spring": 1.0, "autumn": 1.0}
+    train_examples.append(Example(pred1, ref1))
+
+    ref2 = nlp("two")
+    ref2.cats = {"winter": 0.0, "summer": 0.0, "spring": 1.0, "autumn": 1.0}
+    pred2 = nlp("two")
+    pred2.cats = {"winter": 1.0, "summer": 0.0, "spring": 0.0, "autumn": 1.0}
+    train_examples.append(Example(pred2, ref2))
+
+    scores = Scorer().score_cats(train_examples, "cats", labels=["winter", "summer", "spring", "autumn"])
+    assert scores["cats_f_per_type"]["winter"]["p"] == 1/2
+    assert scores["cats_f_per_type"]["winter"]["r"] == 1/1
+    assert scores["cats_f_per_type"]["summer"]["p"] == 0
+    assert scores["cats_f_per_type"]["summer"]["r"] == 0/1
+    assert scores["cats_f_per_type"]["spring"]["p"] == 1/1
+    assert scores["cats_f_per_type"]["spring"]["r"] == 1/2
+    assert scores["cats_f_per_type"]["autumn"]["p"] == 2/2
+    assert scores["cats_f_per_type"]["autumn"]["r"] == 2/2
+
+    assert scores["cats_micro_p"] == 4/5
+    assert scores["cats_micro_r"] == 4/6

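Checking those micro-averaged numbers by hand: pooling over both docs and all four labels gives tp=4 (winter, spring, autumn on doc one; autumn on doc two), fp=1 (winter on doc two) and fn=2 (summer on doc one, spring on doc two):

```python
tp, fp, fn = 4, 1, 2
assert tp / (tp + fp) == 4 / 5  # cats_micro_p
assert tp / (tp + fn) == 4 / 6  # cats_micro_r
```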
@@ -169,3 +169,22 @@ def test_tok2vec_listener():
     nlp.select_pipes(disable="tok2vec")
     assert nlp.pipe_names == ["tagger"]
     nlp("Running the pipeline with the Tok2Vec component disabled.")
+
+
+def test_tok2vec_listener_callback():
+    orig_config = Config().from_str(cfg_string)
+    nlp, config = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
+    assert nlp.pipe_names == ["tok2vec", "tagger"]
+    tagger = nlp.get_pipe("tagger")
+    tok2vec = nlp.get_pipe("tok2vec")
+    nlp._link_components()
+    docs = [nlp.make_doc("A random sentence")]
+    tok2vec.model.initialize(X=docs)
+    gold_array = [[1.0 for tag in ["V", "Z"]] for word in docs]
+    label_sample = [tagger.model.ops.asarray(gold_array, dtype="float32")]
+    tagger.model.initialize(X=docs, Y=label_sample)
+    docs = [nlp.make_doc("Another entirely random sentence")]
+    tok2vec.update([Example.from_dict(x, {}) for x in docs])
+    Y, get_dX = tagger.model.begin_update(docs)
+    # assure that the backprop call works (and doesn't hit a 'None' callback)
+    assert get_dX(Y) is not None

@@ -3,7 +3,7 @@ from spacy.pipeline import Pipe
 from spacy.matcher import PhraseMatcher, Matcher
 from spacy.tokens import Doc, Span, DocBin
 from spacy.training import Example, Corpus
-from spacy.training.converters import json2docs
+from spacy.training.converters import json_to_docs
 from spacy.vocab import Vocab
 from spacy.lang.en import English
 from spacy.util import minibatch, ensure_path, load_model
@@ -425,7 +425,7 @@ def test_issue4402():
     attrs = ["ORTH", "SENT_START", "ENT_IOB", "ENT_TYPE"]
     with make_tempdir() as tmpdir:
         output_file = tmpdir / "test4402.spacy"
-        docs = json2docs([json_data])
+        docs = json_to_docs([json_data])
         data = DocBin(docs=docs, attrs=attrs).to_bytes()
         with output_file.open("wb") as file_:
             file_.write(data)

@@ -1,7 +1,7 @@
 import pytest
 from spacy.tokens import Doc, Span, DocBin
 from spacy.training import Example
-from spacy.training.converters.conllu2docs import conllu2docs
+from spacy.training.converters.conllu_to_docs import conllu_to_docs
 from spacy.lang.en import English
 from spacy.kb import KnowledgeBase
 from spacy.vocab import Vocab
@@ -82,7 +82,7 @@ def test_issue4651_without_phrase_matcher_attr():

 def test_issue4665():
     """
-    conllu2json should not raise an exception if the HEAD column contains an
+    conllu_to_docs should not raise an exception if the HEAD column contains an
     underscore
     """
     input_data = """
@@ -105,7 +105,7 @@ def test_issue4665():
 17	.	_	PUNCT	.	_	_	punct	_	_
 18	]	_	PUNCT	-RRB-	_	_	punct	_	_
 """
-    conllu2docs(input_data)
+    conllu_to_docs(input_data)


 def test_issue4674():

@@ -67,7 +67,8 @@ width = ${components.tok2vec.model.width}
 parser_config_string = """
 [model]
 @architectures = "spacy.TransitionBasedParser.v1"
-nr_feature_tokens = 99
+state_type = "parser"
+extra_state_tokens = false
 hidden_width = 66
 maxout_pieces = 2

@@ -95,7 +96,11 @@ def my_parser():
         MaxoutWindowEncoder(width=321, window_size=3, maxout_pieces=4, depth=2),
     )
     parser = build_tb_parser_model(
-        tok2vec=tok2vec, nr_feature_tokens=7, hidden_width=65, maxout_pieces=5
+        tok2vec=tok2vec,
+        state_type="parser",
+        extra_state_tokens=True,
+        hidden_width=65,
+        maxout_pieces=5,
     )
     return parser

@@ -340,3 +345,13 @@ def test_config_auto_fill_extra_fields():
     assert "extra" not in nlp.config["training"]
     # Make sure the config generated is valid
     load_model_from_config(nlp.config)
+
+
+def test_config_validate_literal():
+    nlp = English()
+    config = Config().from_str(parser_config_string)
+    config["model"]["state_type"] = "nonsense"
+    with pytest.raises(ConfigValidationError):
+        nlp.add_pipe("parser", config=config)
+    config["model"]["state_type"] = "ner"
+    nlp.add_pipe("parser", config=config)

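What this new test relies on: the `state_type` argument of the registered architecture is annotated with a `Literal` type, so values outside {"parser", "ner"} fail config validation at resolution time. A reduced sketch under that assumption; the registry name and function below are invented, and `typing.Literal` requires Python 3.8+ (or a compat shim):

```python
from typing import Literal
from thinc.api import Config
from spacy.util import registry

@registry.architectures("demo.StateModel.v1")  # hypothetical architecture
def make_state_model(state_type: Literal["parser", "ner"]) -> str:
    return state_type

cfg = Config().from_str("""
[model]
@architectures = "demo.StateModel.v1"
state_type = "nonsense"
""")
# registry.resolve(cfg) would raise a ConfigValidationError here; with
# state_type = "parser" or "ner" it resolves cleanly.
```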
@@ -1,20 +1,21 @@
 import pytest
 from click import NoSuchOption
-from spacy.training import docs_to_json, biluo_tags_from_offsets
-from spacy.training.converters import iob2docs, conll_ner2docs, conllu2docs
+from spacy.training import docs_to_json, offsets_to_biluo_tags
+from spacy.training.converters import iob_to_docs, conll_ner_to_docs, conllu_to_docs
 from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate
 from spacy.cli.init_config import init_config, RECOMMENDATIONS
 from spacy.cli._util import validate_project_commands, parse_config_overrides
 from spacy.cli._util import load_project_config, substitute_project_variables
 from spacy.cli._util import string_to_list, OVERRIDES_ENV_VAR
-from thinc.config import ConfigValidationError
+from spacy.cli.debug_config import check_section_refs
+from thinc.config import ConfigValidationError, Config
 import srsly
 import os

 from .util import make_tempdir


-def test_cli_converters_conllu2json():
+def test_cli_converters_conllu_to_docs():
     # from NorNE: https://github.com/ltgoslo/norne/blob/3d23274965f513f23aa48455b28b1878dad23c05/ud/nob/no_bokmaal-ud-dev.conllu
     lines = [
         "1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\tO",
@@ -23,7 +24,7 @@ def test_cli_converters_conllu2json():
         "4\tavstår\tavstå\tVERB\t_\tMood=Ind|Tense=Pres|VerbForm=Fin\t0\troot\t_\tO",
     ]
     input_data = "\n".join(lines)
-    converted_docs = conllu2docs(input_data, n_sents=1)
+    converted_docs = conllu_to_docs(input_data, n_sents=1)
     assert len(converted_docs) == 1
     converted = [docs_to_json(converted_docs)]
     assert converted[0]["id"] == 0
@@ -39,7 +40,7 @@ def test_cli_converters_conllu2json():
     ent_offsets = [
         (e[0], e[1], e[2]) for e in converted[0]["paragraphs"][0]["entities"]
     ]
-    biluo_tags = biluo_tags_from_offsets(converted_docs[0], ent_offsets, missing="O")
+    biluo_tags = offsets_to_biluo_tags(converted_docs[0], ent_offsets, missing="O")
     assert biluo_tags == ["O", "B-PER", "L-PER", "O"]
@@ -62,9 +63,9 @@ def test_cli_converters_conllu2json():
         ),
     ],
 )
-def test_cli_converters_conllu2json_name_ner_map(lines):
+def test_cli_converters_conllu_to_docs_name_ner_map(lines):
     input_data = "\n".join(lines)
-    converted_docs = conllu2docs(
+    converted_docs = conllu_to_docs(
         input_data, n_sents=1, ner_map={"PER": "PERSON", "BAD": ""}
     )
     assert len(converted_docs) == 1
@@ -83,11 +84,11 @@ def test_cli_converters_conllu2json_name_ner_map(lines):
     ent_offsets = [
         (e[0], e[1], e[2]) for e in converted[0]["paragraphs"][0]["entities"]
     ]
-    biluo_tags = biluo_tags_from_offsets(converted_docs[0], ent_offsets, missing="O")
+    biluo_tags = offsets_to_biluo_tags(converted_docs[0], ent_offsets, missing="O")
     assert biluo_tags == ["O", "B-PERSON", "L-PERSON", "O", "O"]


-def test_cli_converters_conllu2json_subtokens():
+def test_cli_converters_conllu_to_docs_subtokens():
     # https://raw.githubusercontent.com/ohenrik/nb_news_ud_sm/master/original_data/no-ud-dev-ner.conllu
     lines = [
         "1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\tname=O",
@@ -98,7 +99,7 @@ def test_cli_converters_conllu2json_subtokens():
         "5\t.\t$.\tPUNCT\t_\t_\t4\tpunct\t_\tname=O",
     ]
     input_data = "\n".join(lines)
-    converted_docs = conllu2docs(
+    converted_docs = conllu_to_docs(
         input_data, n_sents=1, merge_subtokens=True, append_morphology=True
     )
     assert len(converted_docs) == 1
@@ -132,11 +133,11 @@ def test_cli_converters_conllu2json_subtokens():
     ent_offsets = [
         (e[0], e[1], e[2]) for e in converted[0]["paragraphs"][0]["entities"]
     ]
-    biluo_tags = biluo_tags_from_offsets(converted_docs[0], ent_offsets, missing="O")
+    biluo_tags = offsets_to_biluo_tags(converted_docs[0], ent_offsets, missing="O")
     assert biluo_tags == ["O", "U-PER", "O", "O"]


-def test_cli_converters_iob2json():
+def test_cli_converters_iob_to_docs():
     lines = [
         "I|O like|O London|I-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O",
         "I|O like|O London|B-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O",
@@ -144,7 +145,7 @@ def test_cli_converters_iob2json():
         "I|PRP|O like|VBP|O London|NNP|B-GPE and|CC|O New|NNP|B-GPE York|NNP|I-GPE City|NNP|I-GPE .|.|O",
     ]
     input_data = "\n".join(lines)
-    converted_docs = iob2docs(input_data, n_sents=10)
+    converted_docs = iob_to_docs(input_data, n_sents=10)
     assert len(converted_docs) == 1
     converted = docs_to_json(converted_docs)
     assert converted["id"] == 0
@@ -161,7 +162,7 @@ def test_cli_converters_iob2json():
         assert ent.text in ["New York City", "London"]


-def test_cli_converters_conll_ner2json():
+def test_cli_converters_conll_ner_to_docs():
     lines = [
         "-DOCSTART- -X- O O",
         "",
@@ -211,7 +212,7 @@ def test_cli_converters_conll_ner2json():
         ".\t.\t_\tO",
     ]
     input_data = "\n".join(lines)
-    converted_docs = conll_ner2docs(input_data, n_sents=10)
+    converted_docs = conll_ner_to_docs(input_data, n_sents=10)
     assert len(converted_docs) == 1
     converted = docs_to_json(converted_docs)
     assert converted["id"] == 0
@@ -413,3 +414,15 @@ def test_string_to_list(value):
 def test_string_to_list_intify(value):
     assert string_to_list(value, intify=False) == ["1", "2", "3"]
     assert string_to_list(value, intify=True) == [1, 2, 3]
+
+
+def test_check_section_refs():
+    config = {"a": {"b": {"c": "a.d.e"}, "d": {"e": 1}}, "f": {"g": "d.f"}}
+    config = Config(config)
+    # Valid section reference
+    check_section_refs(config, ["a.b.c"])
+    # Section that doesn't exist in this config
+    check_section_refs(config, ["x.y.z"])
+    # Invalid section reference
+    with pytest.raises(ConfigValidationError):
+        check_section_refs(config, ["a.b.c", "f.g"])

@@ -2,7 +2,7 @@ from numpy.testing import assert_almost_equal, assert_array_almost_equal
 import pytest
 from pytest import approx
 from spacy.training import Example
-from spacy.training.iob_utils import biluo_tags_from_offsets
+from spacy.training.iob_utils import offsets_to_biluo_tags
 from spacy.scorer import Scorer, ROCAUCScore
 from spacy.scorer import _roc_auc_score, _roc_curve
 from spacy.lang.en import English
@@ -186,7 +186,7 @@ def test_ner_per_type(en_vocab):
             words=input_.split(" "),
             ents=[("CARDINAL", 0, 1), ("CARDINAL", 2, 3)],
         )
-        entities = biluo_tags_from_offsets(doc, annot["entities"])
+        entities = offsets_to_biluo_tags(doc, annot["entities"])
         example = Example.from_dict(doc, {"entities": entities})
         # a hack for sentence boundaries
         example.predicted[1].is_sent_start = False
@@ -211,7 +211,7 @@ def test_ner_per_type(en_vocab):
             words=input_.split(" "),
             ents=[("ORG", 0, 1), ("GPE", 5, 6), ("ORG", 6, 7)],
        )
-        entities = biluo_tags_from_offsets(doc, annot["entities"])
+        entities = offsets_to_biluo_tags(doc, annot["entities"])
         example = Example.from_dict(doc, {"entities": entities})
         # a hack for sentence boundaries
         example.predicted[1].is_sent_start = False

@@ -1,9 +1,9 @@
 import numpy
-from spacy.training import biluo_tags_from_offsets, offsets_from_biluo_tags, Alignment
-from spacy.training import spans_from_biluo_tags, iob_to_biluo
+from spacy.training import offsets_to_biluo_tags, biluo_tags_to_offsets, Alignment
+from spacy.training import biluo_tags_to_spans, iob_to_biluo
 from spacy.training import Corpus, docs_to_json
 from spacy.training.example import Example
-from spacy.training.converters import json2docs
+from spacy.training.converters import json_to_docs
 from spacy.training.augment import make_orth_variants_example
 from spacy.lang.en import English
 from spacy.tokens import Doc, DocBin
@@ -69,7 +69,7 @@ def test_gold_biluo_U(en_vocab):
     spaces = [True, True, True, False, True]
     doc = Doc(en_vocab, words=words, spaces=spaces)
     entities = [(len("I flew to "), len("I flew to London"), "LOC")]
-    tags = biluo_tags_from_offsets(doc, entities)
+    tags = offsets_to_biluo_tags(doc, entities)
     assert tags == ["O", "O", "O", "U-LOC", "O"]
@@ -78,7 +78,7 @@ def test_gold_biluo_BL(en_vocab):
     spaces = [True, True, True, True, False, True]
     doc = Doc(en_vocab, words=words, spaces=spaces)
     entities = [(len("I flew to "), len("I flew to San Francisco"), "LOC")]
-    tags = biluo_tags_from_offsets(doc, entities)
+    tags = offsets_to_biluo_tags(doc, entities)
     assert tags == ["O", "O", "O", "B-LOC", "L-LOC", "O"]
@@ -87,7 +87,7 @@ def test_gold_biluo_BIL(en_vocab):
     spaces = [True, True, True, True, True, False, True]
     doc = Doc(en_vocab, words=words, spaces=spaces)
     entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")]
-    tags = biluo_tags_from_offsets(doc, entities)
+    tags = offsets_to_biluo_tags(doc, entities)
     assert tags == ["O", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"]
@@ -100,7 +100,7 @@ def test_gold_biluo_overlap(en_vocab):
         (len("I flew to "), len("I flew to San Francisco"), "LOC"),
     ]
     with pytest.raises(ValueError):
-        biluo_tags_from_offsets(doc, entities)
+        offsets_to_biluo_tags(doc, entities)


 def test_gold_biluo_misalign(en_vocab):
@@ -109,7 +109,7 @@ def test_gold_biluo_misalign(en_vocab):
     doc = Doc(en_vocab, words=words, spaces=spaces)
     entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")]
     with pytest.warns(UserWarning):
-        tags = biluo_tags_from_offsets(doc, entities)
+        tags = offsets_to_biluo_tags(doc, entities)
     assert tags == ["O", "O", "O", "-", "-", "-"]
@@ -155,7 +155,7 @@ def test_example_from_dict_some_ner(en_vocab):


 @pytest.mark.filterwarnings("ignore::UserWarning")
-def test_json2docs_no_ner(en_vocab):
+def test_json_to_docs_no_ner(en_vocab):
     data = [
         {
             "id": 1,
@@ -191,7 +191,7 @@ def test_json2docs_no_ner(en_vocab):
             ],
         }
     ]
-    docs = json2docs(data)
+    docs = json_to_docs(data)
     assert len(docs) == 1
     for doc in docs:
         assert not doc.has_annotation("ENT_IOB")
@@ -358,9 +358,9 @@ def test_roundtrip_offsets_biluo_conversion(en_tokenizer):
     biluo_tags = ["O", "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"]
     offsets = [(10, 24, "LOC"), (29, 35, "GPE")]
     doc = en_tokenizer(text)
-    biluo_tags_converted = biluo_tags_from_offsets(doc, offsets)
+    biluo_tags_converted = offsets_to_biluo_tags(doc, offsets)
     assert biluo_tags_converted == biluo_tags
-    offsets_converted = offsets_from_biluo_tags(doc, biluo_tags)
+    offsets_converted = biluo_tags_to_offsets(doc, biluo_tags)
     offsets_converted = [ent for ent in offsets if ent[2]]
     assert offsets_converted == offsets
@@ -368,7 +368,7 @@ def test_roundtrip_offsets_biluo_conversion(en_tokenizer):
 def test_biluo_spans(en_tokenizer):
     doc = en_tokenizer("I flew to Silicon Valley via London.")
     biluo_tags = ["O", "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"]
-    spans = spans_from_biluo_tags(doc, biluo_tags)
+    spans = biluo_tags_to_spans(doc, biluo_tags)
     spans = [span for span in spans if span.label_]
     assert len(spans) == 2
     assert spans[0].text == "Silicon Valley"

@@ -2,8 +2,8 @@ from .corpus import Corpus  # noqa: F401
 from .example import Example, validate_examples  # noqa: F401
 from .align import Alignment  # noqa: F401
 from .iob_utils import iob_to_biluo, biluo_to_iob  # noqa: F401
-from .iob_utils import biluo_tags_from_offsets, offsets_from_biluo_tags  # noqa: F401
-from .iob_utils import spans_from_biluo_tags, tags_to_entities  # noqa: F401
+from .iob_utils import offsets_to_biluo_tags, biluo_tags_to_offsets  # noqa: F401
+from .iob_utils import biluo_tags_to_spans, tags_to_entities  # noqa: F401
 from .gold_io import docs_to_json, read_json_file  # noqa: F401
 from .batchers import minibatch_by_padded_size, minibatch_by_words  # noqa: F401
 from .loggers import console_logger, wandb_logger  # noqa: F401

@@ -1,4 +1,4 @@
-from .iob2docs import iob2docs  # noqa: F401
-from .conll_ner2docs import conll_ner2docs  # noqa: F401
-from .json2docs import json2docs  # noqa: F401
-from .conllu2docs import conllu2docs  # noqa: F401
+from .iob_to_docs import iob_to_docs  # noqa: F401
+from .conll_ner_to_docs import conll_ner_to_docs  # noqa: F401
+from .json_to_docs import json_to_docs  # noqa: F401
+from .conllu_to_docs import conllu_to_docs  # noqa: F401

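Taken together, the converter and helper renames in this commit move everything onto a consistent `x_to_y` naming scheme. The imports a downstream script would have to update, old names shown as comments:

```python
# before:
#   from spacy.training.converters import iob2docs, conll_ner2docs, json2docs, conllu2docs
#   from spacy.training import biluo_tags_from_offsets, offsets_from_biluo_tags
#   from spacy.training import spans_from_biluo_tags
# after:
from spacy.training.converters import (
    iob_to_docs,
    conll_ner_to_docs,
    json_to_docs,
    conllu_to_docs,
)
from spacy.training import (
    offsets_to_biluo_tags,
    biluo_tags_to_offsets,
    biluo_tags_to_spans,
)
```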
@@ -7,7 +7,7 @@ from ...tokens import Doc, Span
 from ...util import load_model


-def conll_ner2docs(
+def conll_ner_to_docs(
     input_data, n_sents=10, seg_sents=False, model=None, no_print=False, **kwargs
 ):
     """
@@ -1,13 +1,13 @@
 import re

-from .conll_ner2docs import n_sents_info
-from ...training import iob_to_biluo, spans_from_biluo_tags
+from .conll_ner_to_docs import n_sents_info
+from ...training import iob_to_biluo, biluo_tags_to_spans
 from ...tokens import Doc, Token, Span
 from ...vocab import Vocab
 from wasabi import Printer


-def conllu2docs(
+def conllu_to_docs(
     input_data,
     n_sents=10,
     append_morphology=False,
@@ -78,7 +78,7 @@ def read_conllx(
         if lines:
             while lines[0].startswith("#"):
                 lines.pop(0)
-            doc = doc_from_conllu_sentence(
+            doc = conllu_sentence_to_doc(
                 vocab,
                 lines,
                 ner_tag_pattern,
@@ -128,7 +128,7 @@ def get_entities(lines, tag_pattern, ner_map=None):
     return iob_to_biluo(iob)


-def doc_from_conllu_sentence(
+def conllu_sentence_to_doc(
     vocab,
     lines,
     ner_tag_pattern,
@@ -215,7 +215,7 @@ def doc_from_conllu_sentence(
         doc[i]._.merged_lemma = lemmas[i]
         doc[i]._.merged_spaceafter = spaces[i]
     ents = get_entities(lines, ner_tag_pattern, ner_map)
-    doc.ents = spans_from_biluo_tags(doc, ents)
+    doc.ents = biluo_tags_to_spans(doc, ents)

     if merge_subtokens:
         doc = merge_conllu_subtokens(lines, doc)
@@ -1,13 +1,13 @@
 from wasabi import Printer

-from .conll_ner2docs import n_sents_info
+from .conll_ner_to_docs import n_sents_info
 from ...vocab import Vocab
 from ...training import iob_to_biluo, tags_to_entities
 from ...tokens import Doc, Span
 from ...util import minibatch


-def iob2docs(input_data, n_sents=10, no_print=False, *args, **kwargs):
+def iob_to_docs(input_data, n_sents=10, no_print=False, *args, **kwargs):
     """
     Convert IOB files with one sentence per line and tags separated with '|'
     into Doc objects so they can be saved. IOB and IOB2 are accepted.
@@ -1,12 +1,12 @@
 import srsly
 from ..gold_io import json_iterate, json_to_annotations
-from ..example import annotations2doc
+from ..example import annotations_to_doc
 from ..example import _fix_legacy_dict_data, _parse_example_dict_data
 from ...util import load_model
 from ...lang.xx import MultiLanguage


-def json2docs(input_data, model=None, **kwargs):
+def json_to_docs(input_data, model=None, **kwargs):
     nlp = load_model(model) if model is not None else MultiLanguage()
     if not isinstance(input_data, bytes):
         if not isinstance(input_data, str):
@@ -17,6 +17,6 @@ def json_to_docs(input_data, model=None, **kwargs):
         for json_para in json_to_annotations(json_doc):
             example_dict = _fix_legacy_dict_data(json_para)
             tok_dict, doc_dict = _parse_example_dict_data(example_dict)
-            doc = annotations2doc(nlp.vocab, tok_dict, doc_dict)
+            doc = annotations_to_doc(nlp.vocab, tok_dict, doc_dict)
             docs.append(doc)
     return docs
@@ -7,13 +7,13 @@ from ..tokens.span cimport Span
 from ..tokens.span import Span
 from ..attrs import IDS
 from .align import Alignment
-from .iob_utils import biluo_to_iob, biluo_tags_from_offsets, biluo_tags_from_doc
-from .iob_utils import spans_from_biluo_tags
+from .iob_utils import biluo_to_iob, offsets_to_biluo_tags, doc_to_biluo_tags
+from .iob_utils import biluo_tags_to_spans
 from ..errors import Errors, Warnings
 from ..pipeline._parser_internals import nonproj


-cpdef Doc annotations2doc(vocab, tok_annot, doc_annot):
+cpdef Doc annotations_to_doc(vocab, tok_annot, doc_annot):
     """ Create a Doc from dictionaries with token and doc annotations. """
     attrs, array = _annot2array(vocab, tok_annot, doc_annot)
     output = Doc(vocab, words=tok_annot["ORTH"], spaces=tok_annot["SPACY"])
@@ -92,7 +92,7 @@ cdef class Example:
             tok_dict["SPACY"] = [tok.whitespace_ for tok in predicted]
         return Example(
             predicted,
-            annotations2doc(predicted.vocab, tok_dict, doc_dict)
+            annotations_to_doc(predicted.vocab, tok_dict, doc_dict)
         )

     @property
@@ -176,7 +176,7 @@ cdef class Example:
             return [None] * len(self.x)  # should this be 'missing' instead of 'None' ?
         x_ents = self.get_aligned_spans_y2x(self.y.ents)
         # Default to 'None' for missing values
-        x_tags = biluo_tags_from_offsets(
+        x_tags = offsets_to_biluo_tags(
             self.x,
             [(e.start_char, e.end_char, e.label_) for e in x_ents],
             missing=None
@@ -195,7 +195,7 @@ cdef class Example:
         return {
             "doc_annotation": {
                 "cats": dict(self.reference.cats),
-                "entities": biluo_tags_from_doc(self.reference),
+                "entities": doc_to_biluo_tags(self.reference),
                 "links": self._links_to_dict()
             },
             "token_annotation": {
@@ -295,12 +295,12 @@ def _add_entities_to_doc(doc, ner_data):
     elif isinstance(ner_data[0], tuple):
         return _add_entities_to_doc(
             doc,
-            biluo_tags_from_offsets(doc, ner_data)
+            offsets_to_biluo_tags(doc, ner_data)
         )
     elif isinstance(ner_data[0], str) or ner_data[0] is None:
         return _add_entities_to_doc(
             doc,
-            spans_from_biluo_tags(doc, ner_data)
+            biluo_tags_to_spans(doc, ner_data)
         )
     elif isinstance(ner_data[0], Span):
         # Ugh, this is super messy. Really hard to set O entities
@@ -388,7 +388,7 @@ def _parse_ner_tags(biluo_or_offsets, vocab, words, spaces):
         # This is annoying but to convert the offsets we need a Doc
         # that has the target tokenization.
         reference = Doc(vocab, words=words, spaces=spaces)
-        biluo = biluo_tags_from_offsets(reference, biluo_or_offsets)
+        biluo = offsets_to_biluo_tags(reference, biluo_or_offsets)
     else:
         biluo = biluo_or_offsets
     ent_iobs = []

@@ -3,7 +3,7 @@ import srsly
 from .. import util
 from ..errors import Warnings
 from ..tokens import Doc
-from .iob_utils import biluo_tags_from_offsets, tags_to_entities
+from .iob_utils import offsets_to_biluo_tags, tags_to_entities
 import json
@@ -32,7 +32,7 @@ def docs_to_json(docs, doc_id=0, ner_missing_tag="O"):
             if ent.kb_id_:
                 link_dict = {(ent.start_char, ent.end_char): {ent.kb_id_: 1.0}}
                 json_para["links"].append(link_dict)
-        biluo_tags = biluo_tags_from_offsets(doc, json_para["entities"], missing=ner_missing_tag)
+        biluo_tags = offsets_to_biluo_tags(doc, json_para["entities"], missing=ner_missing_tag)
         attrs = ("TAG", "POS", "MORPH", "LEMMA", "DEP", "ENT_IOB")
         include_annotation = {attr: doc.has_annotation(attr) for attr in attrs}
         for j, sent in enumerate(doc.sents):

@@ -1,9 +1,11 @@
+from typing import List, Tuple, Iterable, Union, Iterator
 import warnings
+
 from ..errors import Errors, Warnings
-from ..tokens import Span
+from ..tokens import Span, Doc


-def iob_to_biluo(tags):
+def iob_to_biluo(tags: Iterable[str]) -> List[str]:
     out = []
     tags = list(tags)
     while tags:
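A small usage reminder for the two conversion helpers being typed here (tag sequences invented for illustration):

```python
from spacy.training import iob_to_biluo, biluo_to_iob

iob = ["O", "B-PER", "I-PER", "O"]
biluo = iob_to_biluo(iob)
assert biluo == ["O", "B-PER", "L-PER", "O"]  # last token of the entity becomes L
assert biluo_to_iob(biluo) == iob             # round-trips back to IOB
```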
@@ -12,7 +14,7 @@ def iob_to_biluo(tags):
     return out


-def biluo_to_iob(tags):
+def biluo_to_iob(tags: Iterable[str]) -> List[str]:
     out = []
     for tag in tags:
         if tag is None:
@@ -23,12 +25,12 @@ def biluo_to_iob(tags):
     return out


-def _consume_os(tags):
+def _consume_os(tags: List[str]) -> Iterator[str]:
     while tags and tags[0] == "O":
         yield tags.pop(0)


-def _consume_ent(tags):
+def _consume_ent(tags: List[str]) -> List[str]:
     if not tags:
         return []
     tag = tags.pop(0)
@@ -50,15 +52,17 @@ def _consume_ent(tags):
         return [start] + middle + [end]


-def biluo_tags_from_doc(doc, missing="O"):
-    return biluo_tags_from_offsets(
+def doc_to_biluo_tags(doc: Doc, missing: str = "O"):
+    return offsets_to_biluo_tags(
         doc,
         [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents],
         missing=missing,
     )


-def biluo_tags_from_offsets(doc, entities, missing="O"):
+def offsets_to_biluo_tags(
+    doc: Doc, entities: Iterable[Tuple[int, int, Union[str, int]]], missing: str = "O"
+) -> List[str]:
     """Encode labelled spans into per-token tags, using the
     Begin/In/Last/Unit/Out scheme (BILUO).
@@ -69,7 +73,7 @@ def biluo_tags_from_offsets(doc, entities, missing="O"):
         the original string.
     RETURNS (list): A list of unicode strings, describing the tags. Each tag
         string will be of the form either "", "O" or "{action}-{label}", where
-        action is one of "B", "I", "L", "U". The string "-" is used where the
+        action is one of "B", "I", "L", "U". The missing label is used where the
         entity offsets don't align with the tokenization in the `Doc` object.
         The training algorithm will view these as missing values. "O" denotes a
         non-entity token. "B" denotes the beginning of a multi-token entity,
@@ -80,12 +84,11 @@ def biluo_tags_from_offsets(doc, entities, missing="O"):
         >>> text = 'I like London.'
         >>> entities = [(len('I like '), len('I like London'), 'LOC')]
         >>> doc = nlp.tokenizer(text)
-        >>> tags = biluo_tags_from_offsets(doc, entities)
+        >>> tags = offsets_to_biluo_tags(doc, entities)
         >>> assert tags == ["O", "O", 'U-LOC', "O"]
     """
     # Ensure no overlapping entity labels exist
     tokens_in_ents = {}
-
     starts = {token.idx: token.i for token in doc}
|     ends = {token.idx + len(token): token.i for token in doc} |     ends = {token.idx + len(token): token.i for token in doc} | ||||||
|     biluo = ["-" for _ in doc] |     biluo = ["-" for _ in doc] | ||||||
|  | @ -109,7 +112,6 @@ def biluo_tags_from_offsets(doc, entities, missing="O"): | ||||||
|                         ) |                         ) | ||||||
|                     ) |                     ) | ||||||
|                 tokens_in_ents[token_index] = (start_char, end_char, label) |                 tokens_in_ents[token_index] = (start_char, end_char, label) | ||||||
| 
 |  | ||||||
|             start_token = starts.get(start_char) |             start_token = starts.get(start_char) | ||||||
|             end_token = ends.get(end_char) |             end_token = ends.get(end_char) | ||||||
|             # Only interested if the tokenization is correct |             # Only interested if the tokenization is correct | ||||||
|  | @ -143,7 +145,7 @@ def biluo_tags_from_offsets(doc, entities, missing="O"): | ||||||
|     return biluo |     return biluo | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def spans_from_biluo_tags(doc, tags): | def biluo_tags_to_spans(doc: Doc, tags: Iterable[str]) -> List[Span]: | ||||||
|     """Encode per-token tags following the BILUO scheme into Span object, e.g. |     """Encode per-token tags following the BILUO scheme into Span object, e.g. | ||||||
|     to overwrite the doc.ents. |     to overwrite the doc.ents. | ||||||
| 
 | 
 | ||||||
|  | @ -161,7 +163,9 @@ def spans_from_biluo_tags(doc, tags): | ||||||
|     return spans |     return spans | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def offsets_from_biluo_tags(doc, tags): | def biluo_tags_to_offsets( | ||||||
|  |     doc: Doc, tags: Iterable[str] | ||||||
|  | ) -> List[Tuple[int, int, Union[str, int]]]: | ||||||
|     """Encode per-token tags following the BILUO scheme into entity offsets. |     """Encode per-token tags following the BILUO scheme into entity offsets. | ||||||
| 
 | 
 | ||||||
|     doc (Doc): The document that the BILUO tags refer to. |     doc (Doc): The document that the BILUO tags refer to. | ||||||
|  | @ -172,11 +176,11 @@ def offsets_from_biluo_tags(doc, tags): | ||||||
|         `end` will be character-offset integers denoting the slice into the |         `end` will be character-offset integers denoting the slice into the | ||||||
|         original string. |         original string. | ||||||
|     """ |     """ | ||||||
|     spans = spans_from_biluo_tags(doc, tags) |     spans = biluo_tags_to_spans(doc, tags) | ||||||
|     return [(span.start_char, span.end_char, span.label_) for span in spans] |     return [(span.start_char, span.end_char, span.label_) for span in spans] | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def tags_to_entities(tags): | def tags_to_entities(tags: Iterable[str]) -> List[Tuple[str, int, int]]: | ||||||
|     """Note that the end index returned by this function is inclusive. |     """Note that the end index returned by this function is inclusive. | ||||||
|     To use it for Span creation, increment the end by 1.""" |     To use it for Span creation, increment the end by 1.""" | ||||||
|     entities = [] |     entities = [] | ||||||
|  | @ -209,3 +213,9 @@ def tags_to_entities(tags): | ||||||
|         else: |         else: | ||||||
|             raise ValueError(Errors.E068.format(tag=tag)) |             raise ValueError(Errors.E068.format(tag=tag)) | ||||||
|     return entities |     return entities | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | # Fallbacks to make backwards-compat easier | ||||||
|  | offsets_from_biluo_tags = biluo_tags_to_offsets | ||||||
|  | spans_from_biluo_tags = biluo_tags_to_spans | ||||||
|  | biluo_tags_from_offsets = offsets_to_biluo_tags | ||||||
|  |  | ||||||
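
The net effect of this file is a rename with aliases: the v2-era `*_from_*` helpers become `offsets_to_biluo_tags`, `biluo_tags_to_offsets` and `biluo_tags_to_spans`, while the old names keep working. A quick sketch, assuming spaCy v3 with this patch applied:

```python
from spacy.training import iob_utils

# The old names are plain aliases for the renamed functions.
assert iob_utils.biluo_tags_from_offsets is iob_utils.offsets_to_biluo_tags
assert iob_utils.offsets_from_biluo_tags is iob_utils.biluo_tags_to_offsets
assert iob_utils.spans_from_biluo_tags is iob_utils.biluo_tags_to_spans
```
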
@@ -11,9 +11,12 @@ def console_logger():
     def setup_printer(
         nlp: "Language",
     ) -> Tuple[Callable[[Dict[str, Any]], None], Callable]:
-        score_cols = list(nlp.config["training"]["score_weights"])
+        # we assume here that only components are enabled that should be trained & logged
+        logged_pipes = nlp.pipe_names
+        score_weights = nlp.config["training"]["score_weights"]
+        score_cols = [col for col, value in score_weights.items() if value is not None]
         score_widths = [max(len(col), 6) for col in score_cols]
-        loss_cols = [f"Loss {pipe}" for pipe in nlp.pipe_names]
+        loss_cols = [f"Loss {pipe}" for pipe in logged_pipes]
         loss_widths = [max(len(col), 8) for col in loss_cols]
         table_header = ["E", "#"] + loss_cols + score_cols + ["Score"]
         table_header = [col.upper() for col in table_header]
@@ -26,7 +29,7 @@ def console_logger():
             try:
                 losses = [
                     "{0:.2f}".format(float(info["losses"][pipe_name]))
-                    for pipe_name in nlp.pipe_names
+                    for pipe_name in logged_pipes
                 ]
             except KeyError as e:
                 raise KeyError(
@@ -38,10 +41,15 @@ def console_logger():
                 ) from None
             scores = []
             for col in score_cols:
-                score = float(info["other_scores"].get(col, 0.0))
-                if col != "speed":
-                    score *= 100
-                scores.append("{0:.2f}".format(score))
+                score = info["other_scores"].get(col, 0.0)
+                try:
+                    score = float(score)
+                    if col != "speed":
+                        score *= 100
+                    scores.append("{0:.2f}".format(score))
+                except TypeError:
+                    err = Errors.E916.format(name=col, score_type=type(score))
+                    raise ValueError(err) from None
             data = (
                 [info["epoch"], info["step"]]
                 + losses
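
The logger change is easiest to see in isolation: a score weight of `None` now removes the column from the console table instead of failing the `float()` conversion. A minimal sketch with illustrative weights:

```python
# Illustrative weights: None marks a score that should be neither
# logged nor weighted, mirroring the filtering added above.
score_weights = {"dep_uas": 0.25, "dep_las": 0.25, "ents_f": 0.5, "speed": None}
score_cols = [col for col, value in score_weights.items() if value is not None]
assert score_cols == ["dep_uas", "dep_las", "ents_f"]
```
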
@@ -61,7 +61,7 @@ LEXEME_NORM_LANGS = ["da", "de", "el", "en", "id", "lb", "pt", "ru", "sr", "ta",
 # Default order of sections in the config.cfg. Not all sections need to exist,
 # and additional sections are added at the end, in alphabetical order.
 # fmt: off
-CONFIG_SECTION_ORDER = ["paths", "variables", "system", "nlp", "components", "training", "pretraining"]
+CONFIG_SECTION_ORDER = ["paths", "variables", "system", "nlp", "components", "corpora", "training", "pretraining"]
 # fmt: on


@@ -1202,21 +1202,38 @@ def get_arg_names(func: Callable) -> List[str]:
     return list(set([*argspec.args, *argspec.kwonlyargs]))


-def combine_score_weights(weights: List[Dict[str, float]]) -> Dict[str, float]:
+def combine_score_weights(
+    weights: List[Dict[str, float]],
+    overrides: Dict[str, Optional[Union[float, int]]] = SimpleFrozenDict(),
+) -> Dict[str, float]:
     """Combine and normalize score weights defined by components, e.g.
     {"ents_r": 0.2, "ents_p": 0.3, "ents_f": 0.5} and {"some_other_score": 1.0}.

     weights (List[dict]): The weights defined by the components.
+    overrides (Dict[str, Optional[Union[float, int]]]): Existing scores that
+        should be preserved.
     RETURNS (Dict[str, float]): The combined and normalized weights.
     """
+    # We first need to extract all None/null values for score weights that
+    # shouldn't be shown in the table *or* be weighted
     result = {}
+    all_weights = []
     for w_dict in weights:
+        filtered_weights = {}
+        for key, value in w_dict.items():
+            value = overrides.get(key, value)
+            if value is None:
+                result[key] = None
+            else:
+                filtered_weights[key] = value
+        all_weights.append(filtered_weights)
+    for w_dict in all_weights:
         # We need to account for weights that don't sum to 1.0 and normalize
         # the score weights accordingly, then divide score by the number of
         # components.
         total = sum(w_dict.values())
         for key, value in w_dict.items():
-            weight = round(value / total / len(weights), 2)
+            weight = round(value / total / len(all_weights), 2)
             result[key] = result.get(key, 0.0) + weight
     return result
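
To illustrate the new behavior, here is a sketch against the patched helper (the weight values are made up; it assumes `combine_score_weights` stays importable from `spacy.util`):

```python
from spacy.util import combine_score_weights

weights = [{"dep_uas": 0.5, "dep_las": 0.5}, {"ents_f": 1.0}]
# Each dict is normalized to sum to 1.0, then divided by the number of
# weighted components: 0.5 / 1.0 / 2 == 0.25 and 1.0 / 1.0 / 2 == 0.5.
assert combine_score_weights(weights) == {
    "dep_uas": 0.25,
    "dep_las": 0.25,
    "ents_f": 0.5,
}
# An override of None preserves the key but excludes it from weighting.
combined = combine_score_weights(weights, overrides={"ents_f": None})
assert combined == {"dep_uas": 0.25, "dep_las": 0.25, "ents_f": None}
```
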
@@ -414,7 +414,8 @@ one component.
 > ```ini
 > [model]
 > @architectures = "spacy.TransitionBasedParser.v1"
-> nr_feature_tokens = 6
+> state_type = "ner"
+> extra_state_tokens = false
 > hidden_width = 64
 > maxout_pieces = 2
 >
@@ -447,9 +448,10 @@ consists of either two or three subnetworks:
   as action scores directly.

 | Name                 | Description |
 | -------------------- | ----------- |
 | `tok2vec`            | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~ |
-| `nr_feature_tokens`  | The number of tokens in the context to use to construct the state vector. Valid choices are `1`, `2`, `3`, `6`, `8` and `13`. The `2`, `8` and `13` feature sets are designed for the parser, while the `3` and `6` feature sets are designed for the entity recognizer. The recommended feature sets are `3` for NER, and `8` for the dependency parser. ~~int~~ |
+| `state_type`         | Which task to extract features for. Possible values are "ner" and "parser". ~~str~~ |
+| `extra_state_tokens` | Whether to use an expanded feature set when extracting the state tokens. Slightly slower, but sometimes improves accuracy. Defaults to `False`. ~~bool~~ |
 | `hidden_width`       | The width of the hidden layer. ~~int~~ |
 | `maxout_pieces`      | How many pieces to use in the state prediction layer. Recommended values are `1`, `2` or `3`. If `1`, the maxout non-linearity is replaced with a [`Relu`](https://thinc.ai/docs/api-layers#relu) non-linearity if `use_upper` is `True`, and no non-linearity if `False`. ~~int~~ |
 | `use_upper`          | Whether to use an additional hidden layer after the state vector in order to predict the action scores. It is recommended to set this to `False` for large pretrained models such as transformers, and `True` for smaller networks. The upper layer is computed on CPU, which becomes a bottleneck on larger GPU-based models, where it's also less necessary. ~~bool~~ |
@@ -275,7 +275,7 @@ $ python -m spacy convert ./data.json ./output.spacy
 > entity label, prefixed by the BILUO marker. For example `"B-ORG"` describes
 > the first token of a multi-token `ORG` entity and `"U-PERSON"` a single token
 > representing a `PERSON` entity. The
-> [`biluo_tags_from_offsets`](/api/top-level#biluo_tags_from_offsets) function
+> [`offsets_to_biluo_tags`](/api/top-level#offsets_to_biluo_tags) function
 > can help you convert entity offsets to the right format.

 ```python
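
As a concrete sketch of that conversion (the text and offsets are made up for the example):

```python
import spacy
from spacy.training import offsets_to_biluo_tags

nlp = spacy.blank("en")
doc = nlp("Apple is opening a store in San Francisco")
# Character offsets for "Apple" and "San Francisco"
entities = [(0, 5, "ORG"), (28, 41, "GPE")]
tags = offsets_to_biluo_tags(doc, entities)
assert tags == ["U-ORG", "O", "O", "O", "O", "O", "B-GPE", "L-GPE"]
```
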
@@ -146,15 +146,14 @@ examples, see the
 > ```

 | Name                    | Description |
 | ----------------------- | ----------- |
 | `name`                  | The name of the component factory. ~~str~~ |
 | _keyword-only_          | |
 | `default_config`        | The default config, describing the default values of the factory arguments. ~~Dict[str, Any]~~ |
 | `assigns`               | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ |
 | `requires`              | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ |
 | `retokenizes`           | Whether the component changes tokenization. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~bool~~ |
-| `scores`                | All scores set by the components if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ |
-| `default_score_weights` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. ~~Dict[str, float]~~ |
+| `default_score_weights` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. If a weight is set to `None`, the score will not be logged or weighted. ~~Dict[str, Optional[float]]~~ |
 | `func`                  | Optional function if not used as a decorator. ~~Optional[Callable[[...], Callable[[Doc], Doc]]]~~ |

 ## Language.\_\_call\_\_ {#call tag="method"}
@@ -1037,11 +1036,11 @@ component is defined and stored on the `Language` class for each component
 instance and factory instance.

 | Name                    | Description |
 | ----------------------- | ----------- |
 | `factory`               | The name of the registered component factory. ~~str~~ |
 | `default_config`        | The default config, describing the default values of the factory arguments. ~~Dict[str, Any]~~ |
 | `assigns`               | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ |
 | `requires`              | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ |
 | `retokenizes`           | Whether the component changes tokenization. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~bool~~ |
-| `scores`                | All scores set by the components if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ |
-| `default_score_weights` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. ~~Dict[str, float]~~ |
+| `default_score_weights` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. If a weight is set to `None`, the score will not be logged or weighted. ~~Dict[str, Optional[float]]~~ |
+| `scores`                | All scores set by the components if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. Based on the `default_score_weights` and used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ |
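
A sketch of the updated `default_score_weights` semantics (the component and score names here are illustrative, not part of spaCy):

```python
from spacy.language import Language

@Language.factory(
    "my_scored_component",  # illustrative name
    default_score_weights={"my_acc": 1.0, "my_debug": None},  # None: not logged or weighted
)
def create_my_scored_component(nlp: Language, name: str):
    def component(doc):
        return doc  # a no-op component, just to make the factory complete
    return component
```
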
@@ -619,7 +619,7 @@ sequences in the batch.

 ## Training data and alignment {#gold source="spacy/training"}

-### training.biluo_tags_from_offsets {#biluo_tags_from_offsets tag="function"}
+### training.offsets_to_biluo_tags {#offsets_to_biluo_tags tag="function"}

 Encode labelled spans into per-token tags, using the
 [BILUO scheme](/usage/linguistic-features#accessing-ner) (Begin, In, Last, Unit,
@@ -632,14 +632,20 @@ the beginning of a multi-token entity, `I` the inside of an entity of three or
 more tokens, and `L` the end of an entity of two or more tokens. `U` denotes a
 single-token entity.

+<Infobox title="Changed in v3.0" variant="warning" id="biluo_tags_from_offsets">
+
+This method was previously available as `spacy.gold.biluo_tags_from_offsets`.
+
+</Infobox>
+
 > #### Example
 >
 > ```python
-> from spacy.training import biluo_tags_from_offsets
+> from spacy.training import offsets_to_biluo_tags
 >
 > doc = nlp("I like London.")
 > entities = [(7, 13, "LOC")]
-> tags = biluo_tags_from_offsets(doc, entities)
+> tags = offsets_to_biluo_tags(doc, entities)
 > assert tags == ["O", "O", "U-LOC", "O"]
 > ```

@@ -647,21 +653,28 @@ single-token entity.
 | ----------- | ----------- |
 | `doc`       | The document that the entity offsets refer to. The output tags will refer to the token boundaries within the document. ~~Doc~~ |
 | `entities`  | A sequence of `(start, end, label)` triples. `start` and `end` should be character-offset integers denoting the slice into the original string. ~~List[Tuple[int, int, Union[str, int]]]~~ |
+| `missing`   | The label used for missing values, e.g. if tokenization doesn't align with the entity offsets. Defaults to `"O"`. ~~str~~ |
 | **RETURNS** | A list of strings, describing the [BILUO](/usage/linguistic-features#accessing-ner) tags. ~~List[str]~~ |

-### training.offsets_from_biluo_tags {#offsets_from_biluo_tags tag="function"}
+### training.biluo_tags_to_offsets {#biluo_tags_to_offsets tag="function"}

 Encode per-token tags following the
 [BILUO scheme](/usage/linguistic-features#accessing-ner) into entity offsets.

+<Infobox title="Changed in v3.0" variant="warning" id="offsets_from_biluo_tags">
+
+This method was previously available as `spacy.gold.offsets_from_biluo_tags`.
+
+</Infobox>
+
 > #### Example
 >
 > ```python
-> from spacy.training import offsets_from_biluo_tags
+> from spacy.training import biluo_tags_to_offsets
 >
 > doc = nlp("I like London.")
 > tags = ["O", "O", "U-LOC", "O"]
-> entities = offsets_from_biluo_tags(doc, tags)
+> entities = biluo_tags_to_offsets(doc, tags)
 > assert entities == [(7, 13, "LOC")]
 > ```

@@ -671,21 +684,27 @@ Encode per-token tags following the
 | `entities`  | A sequence of [BILUO](/usage/linguistic-features#accessing-ner) tags with each tag describing one token. Each tag string will be of the form of either `""`, `"O"` or `"{action}-{label}"`, where action is one of `"B"`, `"I"`, `"L"`, `"U"`. ~~List[str]~~ |
 | **RETURNS** | A sequence of `(start, end, label)` triples. `start` and `end` will be character-offset integers denoting the slice into the original string. ~~List[Tuple[int, int, str]]~~ |

-### training.spans_from_biluo_tags {#spans_from_biluo_tags tag="function" new="2.1"}
+### training.biluo_tags_to_spans {#biluo_tags_to_spans tag="function" new="2.1"}

 Encode per-token tags following the
 [BILUO scheme](/usage/linguistic-features#accessing-ner) into
 [`Span`](/api/span) objects. This can be used to create entity spans from
 token-based tags, e.g. to overwrite the `doc.ents`.

+<Infobox title="Changed in v3.0" variant="warning" id="spans_from_biluo_tags">
+
+This method was previously available as `spacy.gold.spans_from_biluo_tags`.
+
+</Infobox>
+
 > #### Example
 >
 > ```python
-> from spacy.training import spans_from_biluo_tags
+> from spacy.training import biluo_tags_to_spans
 >
 > doc = nlp("I like London.")
 > tags = ["O", "O", "U-LOC", "O"]
-> doc.ents = spans_from_biluo_tags(doc, tags)
+> doc.ents = biluo_tags_to_spans(doc, tags)
 > ```

 | Name        | Description |
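
Taken together, the three renamed helpers round-trip cleanly. A short sketch combining the examples above, assuming a blank English pipeline:

```python
import spacy
from spacy.training import (
    offsets_to_biluo_tags,
    biluo_tags_to_offsets,
    biluo_tags_to_spans,
)

nlp = spacy.blank("en")
doc = nlp("I like London.")
tags = offsets_to_biluo_tags(doc, [(7, 13, "LOC")])
assert tags == ["O", "O", "U-LOC", "O"]
assert biluo_tags_to_offsets(doc, tags) == [(7, 13, "LOC")]
doc.ents = biluo_tags_to_spans(doc, tags)  # overwrite the entities
```
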
@@ -1,24 +1,19 @@
 import { Help } from 'components/typography'; import Link from 'components/link'

-<!-- TODO: update, add project template -->
+<!-- TODO: update numbers -->

 <figure>

-| System                                                     | Parser | Tagger |  NER | WPS<br />CPU <Help>words per second on CPU, higher is better</Help> | WPS<br/>GPU <Help>words per second on GPU, higher is better</Help> |
+| Pipeline                                                   | Parser | Tagger |  NER | WPS<br />CPU <Help>words per second on CPU, higher is better</Help> | WPS<br/>GPU <Help>words per second on GPU, higher is better</Help> |
 | ---------------------------------------------------------- | -----: | -----: | ---: | -------------------------------------------------------------------: | ------------------------------------------------------------------: |
 | [`en_core_web_trf`](/models/en#en_core_web_trf) (spaCy v3) |        |        |      |                                                                      |                                                                   6k |
 | [`en_core_web_lg`](/models/en#en_core_web_lg) (spaCy v3)   |        |        |      |                                                                      |                                                                      |
 | `en_core_web_lg` (spaCy v2)                                |   91.9 |   97.2 | 85.9 |                                                                  10k |                                                                      |
-| [Stanza](https://stanfordnlp.github.io/stanza/) (StanfordNLP)<sup>1</sup> | _n/a_<sup>2</sup> | _n/a_<sup>2</sup> | 88.8 | 234 | 2k |
-| <Link to="https://github.com/flairNLP/flair" hideIcon>Flair</Link>        | - | 97.9 | 89.3 |  |  |

 <figcaption class="caption">

-**Accuracy and speed on the
-[OntoNotes 5.0](https://catalog.ldc.upenn.edu/LDC2013T19) corpus.**<br />**1. **
-[Qi et al. (2020)](https://arxiv.org/pdf/2003.07082.pdf). **2. ** _Coming soon_:
-Qi et al. don't report parsing and tagging results on OntoNotes. We're working
-on training Stanza on this corpus to allow direct comparison.
+**Full pipeline accuracy and speed** on the
+[OntoNotes 5.0](https://catalog.ldc.upenn.edu/LDC2013T19) corpus.

 </figcaption>

@@ -26,18 +21,24 @@ on training Stanza on this corpus to allow direct comparison.

 <figure>

-| System                                                                         |  POS |  UAS |  LAS |
-| ------------------------------------------------------------------------------ | ---: | ---: | ---: |
-| spaCy RoBERTa (2020)                                                           |      |      |      |
-| spaCy CNN (2020)                                                               |      |      |      |
-| [Mrini et al.](https://khalilmrini.github.io/Label_Attention_Layer.pdf) (2019) | 97.3 | 97.4 | 96.3 |
-| [Zhou and Zhao](https://www.aclweb.org/anthology/P19-1230/) (2019)             | 97.3 | 97.2 | 95.7 |
+| Named Entity Recognition System                                                | OntoNotes | CoNLL '03 |
+| ------------------------------------------------------------------------------ | --------: | --------: |
+| spaCy RoBERTa (2020)                                                           |           |      92.2 |
+| spaCy CNN (2020)                                                               |      85.3 |      88.4 |
+| spaCy CNN (2017)                                                               |      86.4 |           |
+| [Stanza](https://stanfordnlp.github.io/stanza/) (StanfordNLP)<sup>1</sup>      |      88.8 |      92.1 |
+| <Link to="https://github.com/flairNLP/flair" hideIcon>Flair</Link><sup>2</sup> |      89.7 |      93.1 |
+| BERT Base<sup>3</sup>                                                          |         - |      92.4 |

 <figcaption class="caption">

-**Accuracy on the Penn Treebank.** See
-[NLP-progress](http://nlpprogress.com/english/dependency_parsing.html) for more
-results.
+**Named entity recognition accuracy** on the
+[OntoNotes 5.0](https://catalog.ldc.upenn.edu/LDC2013T19) and
+[CoNLL-2003](https://www.aclweb.org/anthology/W03-0419.pdf) corpora. See
+[NLP-progress](http://nlpprogress.com/english/named_entity_recognition.html) for
+more results. **1. ** [Qi et al. (2020)](https://arxiv.org/pdf/2003.07082.pdf).
+**2. ** [Akbik et al. (2018)](https://www.aclweb.org/anthology/C18-1139/). **3.
+** [Devlin et al. (2018)](https://arxiv.org/abs/1810.04805).

 </figcaption>

@@ -235,8 +235,6 @@ The `Transformer` component sets the
 [`Doc._.trf_data`](/api/transformer#custom_attributes) extension attribute,
 which lets you access the transformers outputs at runtime.

-<!-- TODO: update/confirm once we have final models trained -->
-
 ```cli
 $ python -m spacy download en_core_trf_lg
 ```
@@ -448,7 +446,8 @@ factory = "ner"

 [nlp.pipeline.ner.model]
 @architectures = "spacy.TransitionBasedParser.v1"
-nr_feature_tokens = 3
+state_type = "ner"
+extra_state_tokens = false
 hidden_width = 128
 maxout_pieces = 3
 use_upper = false
@@ -61,12 +61,25 @@ import Benchmarks from 'usage/\_benchmarks-models.md'

 <Benchmarks />

-<Project id="benchmarks/parsing_penn_treebank">
+<figure>

-The easiest way to reproduce spaCy's benchmarks on the Penn Treebank is to clone
-our project template.
+| Dependency Parsing System                                                      |  UAS |  LAS |
+| ------------------------------------------------------------------------------ | ---: | ---: |
+| spaCy RoBERTa (2020)<sup>1</sup>                                               | 96.8 | 95.0 |
+| spaCy CNN (2020)<sup>1</sup>                                                   | 93.7 | 91.8 |
+| [Mrini et al.](https://khalilmrini.github.io/Label_Attention_Layer.pdf) (2019) | 97.4 | 96.3 |
+| [Zhou and Zhao](https://www.aclweb.org/anthology/P19-1230/) (2019)             | 97.2 | 95.7 |

-</Project>
+<figcaption class="caption">
+
+**Dependency parsing accuracy** on the Penn Treebank. See
+[NLP-progress](http://nlpprogress.com/english/dependency_parsing.html) for more
+results. **1. ** Project template:
+[`benchmarks/parsing_penn_treebank`](%%GITHUB_PROJECTS/benchmarks/parsing_penn_treebank).
+
+</figcaption>
+
+</figure>

 <!-- TODO: ## Citing spaCy {#citation}

@@ -1654,9 +1654,12 @@ The [`SentenceRecognizer`](/api/sentencerecognizer) is a simple statistical
 component that only provides sentence boundaries. Along with being faster and
 smaller than the parser, its primary advantage is that it's easier to train
 because it only requires annotated sentence boundaries rather than full
-dependency parses.
-
-<!-- TODO: update/confirm usage once we have final models trained -->
+dependency parses. spaCy's [trained pipelines](/models) include both a parser
+and a trained sentence segmenter, which is
+[disabled](/usage/processing-pipelines#disabling) by default. If you only need
+sentence boundaries and no parser, you can use the `enable` and `disable`
+arguments on [`spacy.load`](/api/top-level#spacy.load) to enable the senter and
+disable the parser.

 > #### senter vs. parser
 >
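
One way to express this with the v3 API is sketched below, using `nlp.enable_pipe`; it assumes the small English pipeline is installed and ships a disabled `senter`:

```python
import spacy

# Load without the parser, then switch on the lighter sentence recognizer.
nlp = spacy.load("en_core_web_sm", disable=["parser"])
nlp.enable_pipe("senter")
doc = nlp("This is a sentence. This is another one.")
print([sent.text for sent in doc.sents])
```
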
|  | @ -253,8 +253,6 @@ different mechanisms you can use: | ||||||
| Disabled and excluded component names can be provided to | Disabled and excluded component names can be provided to | ||||||
| [`spacy.load`](/api/top-level#spacy.load) as a list. | [`spacy.load`](/api/top-level#spacy.load) as a list. | ||||||
| 
 | 
 | ||||||
| <!-- TODO: update with info on our models shipped with optional components --> |  | ||||||
| 
 |  | ||||||
| > #### 💡 Optional pipeline components | > #### 💡 Optional pipeline components | ||||||
| > | > | ||||||
| > The `disable` mechanism makes it easy to distribute pipeline packages with | > The `disable` mechanism makes it easy to distribute pipeline packages with | ||||||
|  | @ -262,6 +260,11 @@ Disabled and excluded component names can be provided to | ||||||
| > your pipeline may include a statistical _and_ a rule-based component for | > your pipeline may include a statistical _and_ a rule-based component for | ||||||
| > sentence segmentation, and you can choose which one to run depending on your | > sentence segmentation, and you can choose which one to run depending on your | ||||||
| > use case. | > use case. | ||||||
|  | > | ||||||
|  | > For example, spaCy's [trained pipelines](/models) like | ||||||
|  | > [`en_core_web_sm`](/models/en#en_core_web_sm) contain both a `parser` and | ||||||
|  | > `senter` that perform sentence segmentation, but the `senter` is disabled by | ||||||
|  | > default. | ||||||
| 
 | 
 | ||||||
| ```python | ```python | ||||||
| # Load the pipeline without the entity recognizer | # Load the pipeline without the entity recognizer | ||||||
|  | @ -1501,7 +1504,7 @@ add those entities to the `doc.ents`, you can wrap it in a custom pipeline | ||||||
| component function and pass it the token texts from the `Doc` object received by | component function and pass it the token texts from the `Doc` object received by | ||||||
| the component. | the component. | ||||||
| 
 | 
 | ||||||
| The [`training.spans_from_biluo_tags`](/api/top-level#spans_from_biluo_tags) is very | The [`training.biluo_tags_to_spans`](/api/top-level#biluo_tags_to_spans) function is very | ||||||
| helpful here, because it takes a `Doc` object and token-based BILUO tags and | helpful here, because it takes a `Doc` object and token-based BILUO tags and | ||||||
| returns a sequence of `Span` objects in the `Doc` with added labels. So all your | returns a sequence of `Span` objects in the `Doc` with added labels. So all your | ||||||
| wrapper has to do is compute the entity spans and overwrite the `doc.ents`. | wrapper has to do is compute the entity spans and overwrite the `doc.ents`. | ||||||
|  | @ -1516,14 +1519,14 @@ wrapper has to do is compute the entity spans and overwrite the `doc.ents`. | ||||||
| ```python | ```python | ||||||
| ### {highlight="1,8-9"} | ### {highlight="1,8-9"} | ||||||
| import your_custom_entity_recognizer | import your_custom_entity_recognizer | ||||||
| from spacy.training import offsets_from_biluo_tags | from spacy.training import biluo_tags_to_spans | ||||||
| from spacy.language import Language | from spacy.language import Language | ||||||
| 
 | 
 | ||||||
| @Language.component("custom_ner_wrapper") | @Language.component("custom_ner_wrapper") | ||||||
| def custom_ner_wrapper(doc): | def custom_ner_wrapper(doc): | ||||||
|     words = [token.text for token in doc] |     words = [token.text for token in doc] | ||||||
|     custom_entities = your_custom_entity_recognizer(words) |     custom_entities = your_custom_entity_recognizer(words) | ||||||
|     doc.ents = spans_from_biluo_tags(doc, custom_entities) |     doc.ents = biluo_tags_to_spans(doc, custom_entities) | ||||||
|     return doc |     return doc | ||||||
| ``` | ``` | ||||||
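Once registered via `@Language.component`, the wrapper can be added to any
pipeline by its string name. A usage sketch, assuming the wrapper above has been
registered and `your_custom_entity_recognizer` is importable:

```python
import spacy

nlp = spacy.blank("en")
nlp.add_pipe("custom_ner_wrapper")  # the name registered by the decorator
doc = nlp("Some text about London")
print([(ent.text, ent.label_) for ent in doc.ents])
```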
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -213,7 +213,7 @@ a quick web demo. It looks pretty similar to a config file used to define CI | ||||||
| pipelines. | pipelines. | ||||||
| 
 | 
 | ||||||
| ```yaml | ```yaml | ||||||
| https://github.com/explosion/projects/tree/v3/pipelines/tagger_parser_ud/project.yml | %%GITHUB_PROJECTS/pipelines/tagger_parser_ud/project.yml | ||||||
| ``` | ``` | ||||||
| 
 | 
 | ||||||
| | Section       | Description                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  | | | Section       | Description                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  | | ||||||
|  | @ -733,7 +733,10 @@ workflows, but only one can be tracked by DVC. | ||||||
| <Infobox title="This section is still under construction" emoji="🚧" variant="warning"> | <Infobox title="This section is still under construction" emoji="🚧" variant="warning"> | ||||||
| 
 | 
 | ||||||
| The Prodigy integration will require a nightly version of Prodigy that supports | The Prodigy integration will require a nightly version of Prodigy that supports | ||||||
| spaCy v3+. | spaCy v3+. You can already use annotations created with Prodigy in spaCy v3 by | ||||||
|  | exporting your data with | ||||||
|  | [`data-to-spacy`](https://prodi.gy/docs/recipes#data-to-spacy) and running | ||||||
|  | [`spacy convert`](/api/cli#convert) to convert it to the binary format. | ||||||
| 
 | 
 | ||||||
| </Infobox> | </Infobox> | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -470,6 +470,7 @@ score. | ||||||
| ```ini | ```ini | ||||||
| [training.score_weights] | [training.score_weights] | ||||||
| dep_las = 0.4 | dep_las = 0.4 | ||||||
|  | dep_uas = null | ||||||
| ents_f = 0.4 | ents_f = 0.4 | ||||||
| tag_acc = 0.2 | tag_acc = 0.2 | ||||||
| token_acc = 0.0 | token_acc = 0.0 | ||||||
|  | @ -481,9 +482,9 @@ you generate a config for a given pipeline, the score weights are generated by | ||||||
| combining and normalizing the default score weights of the pipeline components. | combining and normalizing the default score weights of the pipeline components. | ||||||
| The default score weights are defined by each pipeline component via the | The default score weights are defined by each pipeline component via the | ||||||
| `default_score_weights` setting on the | `default_score_weights` setting on the | ||||||
| [`@Language.component`](/api/language#component) or | [`@Language.factory`](/api/language#factory) decorator. By default, all pipeline | ||||||
| [`@Language.factory`](/api/language#factory). By default, all pipeline | components are weighted equally. If a score weight is set to `null`, it will be | ||||||
| components are weighted equally. | excluded from the logs and the score won't be weighted. | ||||||
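For illustration, here's a sketch of how a component factory could register its
own score weights; the component and score names are hypothetical. A weight of
`None` in Python corresponds to `null` in the config:

```python
from spacy.language import Language

@Language.factory(
    "my_component",  # hypothetical component name
    default_score_weights={"my_score": 1.0, "my_other_score": None},
)
def create_my_component(nlp, name):
    def my_component(doc):
        # No-op body, just enough to make the sketch runnable
        return doc
    return my_component
```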
| 
 | 
 | ||||||
| <Accordion title="Understanding the training output and score types" spaced> | <Accordion title="Understanding the training output and score types" spaced> | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -88,7 +88,10 @@ import Benchmarks from 'usage/\_benchmarks-models.md' | ||||||
| - **Architectures:** [TransformerModel](/api/architectures#TransformerModel), | - **Architectures:** [TransformerModel](/api/architectures#TransformerModel), | ||||||
|   [TransformerListener](/api/architectures#TransformerListener), |   [TransformerListener](/api/architectures#TransformerListener), | ||||||
|   [Tok2VecTransformer](/api/architectures#Tok2VecTransformer) |   [Tok2VecTransformer](/api/architectures#Tok2VecTransformer) | ||||||
| - **Trained Pipelines:** [`en_core_web_trf`](/models/en#en_core_web_trf) | - **Trained Pipelines:** [`en_core_web_trf`](/models/en#en_core_web_trf), | ||||||
|  |   [`de_dep_news_trf`](/models/de#de_dep_news_trf), | ||||||
|  |   [`es_dep_news_trf`](/models/es#es_dep_news_trf), | ||||||
|  |   [`fr_dep_news_trf`](/models/fr#fr_dep_news_trf) | ||||||
| - **Implementation:** | - **Implementation:** | ||||||
|   [`spacy-transformers`](https://github.com/explosion/spacy-transformers) |   [`spacy-transformers`](https://github.com/explosion/spacy-transformers) | ||||||
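Once downloaded, a transformer-based pipeline loads like any other package. A
minimal sketch, assuming `en_core_web_trf` is installed:

```python
import spacy

nlp = spacy.load("en_core_web_trf")
doc = nlp("This is a sentence.")
print([(ent.text, ent.label_) for ent in doc.ents])
```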
| 
 | 
 | ||||||
|  | @ -549,12 +552,14 @@ Note that spaCy v3.0 now requires **Python 3.6+**. | ||||||
| ### Removed or renamed API {#incompat-removed} | ### Removed or renamed API {#incompat-removed} | ||||||
| 
 | 
 | ||||||
| | Removed                                                                                      | Replacement                                                                                                                                                                                                              | | | Removed                                                                                      | Replacement                                                                                                                                                                                                              | | ||||||
| | -------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------ | | | -------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | ||||||
| | `Language.disable_pipes`                                                                     | [`Language.select_pipes`](/api/language#select_pipes), [`Language.disable_pipe`](/api/language#disable_pipe)                                                                                                             | | | `Language.disable_pipes`                                                                     | [`Language.select_pipes`](/api/language#select_pipes), [`Language.disable_pipe`](/api/language#disable_pipe)                                                                                                             | | ||||||
|  | | `Doc.is_tagged`, `Doc.is_parsed`, ...                                                        | [`Doc.has_annotation`](/api/doc#has_annotation)                                                                                                                                                                          | | ||||||
| | `GoldParse`                                                                                  | [`Example`](/api/example)                                                                                                                                                                                                | | | `GoldParse`                                                                                  | [`Example`](/api/example)                                                                                                                                                                                                | | ||||||
| | `GoldCorpus`                                                                                 | [`Corpus`](/api/corpus)                                                                                                                                                                                                  | | | `GoldCorpus`                                                                                 | [`Corpus`](/api/corpus)                                                                                                                                                                                                  | | ||||||
| | `KnowledgeBase.load_bulk`, `KnowledgeBase.dump`                                              | [`KnowledgeBase.from_disk`](/api/kb#from_disk), [`KnowledgeBase.to_disk`](/api/kb#to_disk)                                                                                                                               | | | `KnowledgeBase.load_bulk`, `KnowledgeBase.dump`                                              | [`KnowledgeBase.from_disk`](/api/kb#from_disk), [`KnowledgeBase.to_disk`](/api/kb#to_disk)                                                                                                                               | | ||||||
| | `Matcher.pipe`, `PhraseMatcher.pipe`                                                         | not needed                                                                                                                                                                                                               | | | `Matcher.pipe`, `PhraseMatcher.pipe`                                                         | not needed                                                                                                                                                                                                               | | ||||||
|  | | `gold.offsets_from_biluo_tags`, `gold.spans_from_biluo_tags`, `gold.biluo_tags_from_offsets` | [`training.biluo_tags_to_offsets`](/api/top-level#biluo_tags_to_offsets), [`training.biluo_tags_to_spans`](/api/top-level#biluo_tags_to_spans), [`training.offsets_to_biluo_tags`](/api/top-level#offsets_to_biluo_tags) | | ||||||
| | `spacy init-model`                                                                           | [`spacy init vocab`](/api/cli#init-vocab)                                                                                                                                                                                | | | `spacy init-model`                                                                           | [`spacy init vocab`](/api/cli#init-vocab)                                                                                                                                                                                | | ||||||
| | `spacy debug-data`                                                                           | [`spacy debug data`](/api/cli#debug-data)                                                                                                                                                                                | | | `spacy debug-data`                                                                           | [`spacy debug data`](/api/cli#debug-data)                                                                                                                                                                                | | ||||||
| | `spacy profile`                                                                              | [`spacy debug profile`](/api/cli#debug-profile)                                                                                                                                                                          | | | `spacy profile`                                                                              | [`spacy debug profile`](/api/cli#debug-profile)                                                                                                                                                                          | | ||||||
|  | @ -968,16 +973,17 @@ python -m spacy package ./output ./packages | ||||||
| 
 | 
 | ||||||
| #### Data utilities and gold module {#migrating-gold} | #### Data utilities and gold module {#migrating-gold} | ||||||
| 
 | 
 | ||||||
| The `spacy.gold` module has been renamed to `spacy.training`. This mostly | The `spacy.gold` module has been renamed to `spacy.training` and the conversion | ||||||
| affects internals, but if you've been using the span offset conversion utilities | utilities now follow the naming format of `x_to_y`. This mostly affects | ||||||
| [`biluo_tags_from_offsets`](/api/top-level#biluo_tags_from_offsets), | internals, but if you've been using the span offset conversion utilities | ||||||
| [`offsets_from_biluo_tags`](/api/top-level#offsets_from_biluo_tags) or | [`offsets_to_biluo_tags`](/api/top-level#offsets_to_biluo_tags), | ||||||
| [`spans_from_biluo_tags`](/api/top-level#spans_from_biluo_tags), you'll have to | [`biluo_tags_to_offsets`](/api/top-level#biluo_tags_to_offsets) or | ||||||
| change your imports: | [`biluo_tags_to_spans`](/api/top-level#biluo_tags_to_spans), you'll have to | ||||||
|  | update your imports to the new names: | ||||||
| 
 | 
 | ||||||
| ```diff | ```diff | ||||||
| - from spacy.gold import biluo_tags_from_offsets, spans_from_biluo_tags | - from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags, spans_from_biluo_tags | ||||||
| + from spacy.training import biluo_tags_from_offsets, spans_from_biluo_tags | + from spacy.training import offsets_to_biluo_tags, biluo_tags_to_offsets, biluo_tags_to_spans | ||||||
| ``` | ``` | ||||||
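As a quick sanity check that the renamed helpers behave as before, here's a
sketch using a blank English pipeline:

```python
import spacy
from spacy.training import offsets_to_biluo_tags

nlp = spacy.blank("en")
doc = nlp("I like London.")
# Convert character offsets into token-based BILUO tags
tags = offsets_to_biluo_tags(doc, [(7, 13, "LOC")])
print(tags)  # ['O', 'O', 'U-LOC', 'O']
```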
| 
 | 
 | ||||||
| #### Migration notes for plugin maintainers {#migrating-plugins} | #### Migration notes for plugin maintainers {#migrating-plugins} | ||||||
|  |  | ||||||
|  | @ -24,6 +24,7 @@ const branch = isNightly ? 'develop' : 'master' | ||||||
| // Those variables are going to be replaced in the Markdown, e.g. %%GITHUB_SPACY
 | // Those variables are going to be replaced in the Markdown, e.g. %%GITHUB_SPACY
 | ||||||
| const replacements = { | const replacements = { | ||||||
|     GITHUB_SPACY: `https://github.com/explosion/spaCy/tree/${branch}`, |     GITHUB_SPACY: `https://github.com/explosion/spaCy/tree/${branch}`, | ||||||
|  |     GITHUB_PROJECTS: `https://github.com/${site.projectsRepo}`, | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| /** | /** | ||||||
|  |  | ||||||
|  | @ -1,21 +1,11 @@ | ||||||
| { | { | ||||||
|     "languages": [ |     "languages": [ | ||||||
|         { |         { "code": "af", "name": "Afrikaans" }, | ||||||
|             "code": "zh", |         { "code": "ar", "name": "Arabic", "example": "هذه جملة", "has_examples": true }, | ||||||
|             "name": "Chinese", |         { "code": "bg", "name": "Bulgarian", "example": "Това е изречение", "has_examples": true }, | ||||||
|             "models": ["zh_core_web_sm", "zh_core_web_md", "zh_core_web_lg"], |         { "code": "bn", "name": "Bengali", "has_examples": true }, | ||||||
|             "dependencies": [ |         { "code": "ca", "name": "Catalan", "example": "Això és una frase.", "has_examples": true }, | ||||||
|                 { |         { "code": "cs", "name": "Czech", "has_examples": true }, | ||||||
|                     "name": "Jieba", |  | ||||||
|                     "url": "https://github.com/fxsjy/jieba" |  | ||||||
|                 }, |  | ||||||
|                 { |  | ||||||
|                     "name": "PKUSeg", |  | ||||||
|                     "url": "https://github.com/lancopku/PKUSeg-python" |  | ||||||
|                 } |  | ||||||
|             ], |  | ||||||
|             "has_examples": true |  | ||||||
|         }, |  | ||||||
|         { |         { | ||||||
|             "code": "da", |             "code": "da", | ||||||
|             "name": "Danish", |             "name": "Danish", | ||||||
|  | @ -23,39 +13,10 @@ | ||||||
|             "has_examples": true, |             "has_examples": true, | ||||||
|             "models": ["da_core_news_sm", "da_core_news_md", "da_core_news_lg"] |             "models": ["da_core_news_sm", "da_core_news_md", "da_core_news_lg"] | ||||||
|         }, |         }, | ||||||
|         { |  | ||||||
|             "code": "nl", |  | ||||||
|             "name": "Dutch", |  | ||||||
|             "models": ["nl_core_news_sm", "nl_core_news_md", "nl_core_news_lg"], |  | ||||||
|             "example": "Dit is een zin.", |  | ||||||
|             "has_examples": true |  | ||||||
|         }, |  | ||||||
|         { |  | ||||||
|             "code": "en", |  | ||||||
|             "name": "English", |  | ||||||
|             "models": ["en_core_web_sm", "en_core_web_md", "en_core_web_lg"], |  | ||||||
|             "starters": [ |  | ||||||
|                 "en_vectors_web_lg", |  | ||||||
|                 "en_trf_bertbaseuncased_lg", |  | ||||||
|                 "en_trf_robertabase_lg", |  | ||||||
|                 "en_trf_distilbertbaseuncased_lg", |  | ||||||
|                 "en_trf_xlnetbasecased_lg" |  | ||||||
|             ], |  | ||||||
|             "example": "This is a sentence.", |  | ||||||
|             "has_examples": true |  | ||||||
|         }, |  | ||||||
|         { |  | ||||||
|             "code": "fr", |  | ||||||
|             "name": "French", |  | ||||||
|             "models": ["fr_core_news_sm", "fr_core_news_md", "fr_core_news_lg"], |  | ||||||
|             "example": "C'est une phrase.", |  | ||||||
|             "has_examples": true |  | ||||||
|         }, |  | ||||||
|         { |         { | ||||||
|             "code": "de", |             "code": "de", | ||||||
|             "name": "German", |             "name": "German", | ||||||
|             "models": ["de_core_news_sm", "de_core_news_md", "de_core_news_lg"], |             "models": ["de_core_news_sm", "de_core_news_md", "de_core_news_lg", "de_dep_news_trf"], | ||||||
|             "starters": ["de_trf_bertbasecased_lg"], |  | ||||||
|             "example": "Dies ist ein Satz.", |             "example": "Dies ist ein Satz.", | ||||||
|             "has_examples": true |             "has_examples": true | ||||||
|         }, |         }, | ||||||
|  | @ -66,6 +27,46 @@ | ||||||
|             "example": "Αυτή είναι μια πρόταση.", |             "example": "Αυτή είναι μια πρόταση.", | ||||||
|             "has_examples": true |             "has_examples": true | ||||||
|         }, |         }, | ||||||
|  |         { | ||||||
|  |             "code": "en", | ||||||
|  |             "name": "English", | ||||||
|  |             "models": ["en_core_web_sm", "en_core_web_md", "en_core_web_lg", "en_core_web_trf"], | ||||||
|  |             "starters": ["en_vectors_web_lg"], | ||||||
|  |             "example": "This is a sentence.", | ||||||
|  |             "has_examples": true | ||||||
|  |         }, | ||||||
|  |         { | ||||||
|  |             "code": "es", | ||||||
|  |             "name": "Spanish", | ||||||
|  |             "models": ["es_core_news_sm", "es_core_news_md", "es_core_news_lg", "es_dep_news_trf"], | ||||||
|  |             "example": "Esto es una frase.", | ||||||
|  |             "has_examples": true | ||||||
|  |         }, | ||||||
|  |         { "code": "et", "name": "Estonian" }, | ||||||
|  |         { "code": "eu", "name": "Basque", "has_examples": true }, | ||||||
|  |         { "code": "fa", "name": "Persian", "has_examples": true }, | ||||||
|  |         { "code": "fi", "name": "Finnish", "has_examples": true }, | ||||||
|  |         { | ||||||
|  |             "code": "fr", | ||||||
|  |             "name": "French", | ||||||
|  |             "models": ["fr_core_news_sm", "fr_core_news_md", "fr_core_news_lg", "fr_dep_news_trf"], | ||||||
|  |             "example": "C'est une phrase.", | ||||||
|  |             "has_examples": true | ||||||
|  |         }, | ||||||
|  |         { "code": "ga", "name": "Irish" }, | ||||||
|  |         { "code": "gu", "name": "Gujarati", "has_examples": true }, | ||||||
|  |         { "code": "he", "name": "Hebrew", "example": "זהו משפט.", "has_examples": true }, | ||||||
|  |         { "code": "hi", "name": "Hindi", "example": "यह एक वाक्य है।", "has_examples": true }, | ||||||
|  |         { "code": "hr", "name": "Croatian", "has_examples": true }, | ||||||
|  |         { "code": "hu", "name": "Hungarian", "example": "Ez egy mondat.", "has_examples": true }, | ||||||
|  |         { "code": "hy", "name": "Armenian", "has_examples": true }, | ||||||
|  |         { | ||||||
|  |             "code": "id", | ||||||
|  |             "name": "Indonesian", | ||||||
|  |             "example": "Ini adalah sebuah kalimat.", | ||||||
|  |             "has_examples": true | ||||||
|  |         }, | ||||||
|  |         { "code": "is", "name": "Icelandic" }, | ||||||
|         { |         { | ||||||
|             "code": "it", |             "code": "it", | ||||||
|             "name": "Italian", |             "name": "Italian", | ||||||
|  | @ -88,12 +89,37 @@ | ||||||
|             "example": "これは文章です。", |             "example": "これは文章です。", | ||||||
|             "has_examples": true |             "has_examples": true | ||||||
|         }, |         }, | ||||||
|  |         { "code": "kn", "name": "Kannada", "has_examples": true }, | ||||||
|  |         { | ||||||
|  |             "code": "ko", | ||||||
|  |             "name": "Korean", | ||||||
|  |             "dependencies": [ | ||||||
|  |                 { | ||||||
|  |                     "name": "mecab-ko", | ||||||
|  |                     "url": "https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md" | ||||||
|  |                 }, | ||||||
|  |                 { "name": "mecab-ko-dic", "url": "https://bitbucket.org/eunjeon/mecab-ko-dic" }, | ||||||
|  |                 { "name": "natto-py", "url": "https://github.com/buruzaemon/natto-py" } | ||||||
|  |             ], | ||||||
|  |             "example": "이것은 문장입니다.", | ||||||
|  |             "has_examples": true | ||||||
|  |         }, | ||||||
|  |         { "code": "lb", "name": "Luxembourgish", "has_examples": true }, | ||||||
|  |         { | ||||||
|  |             "code": "lij", | ||||||
|  |             "name": "Ligurian", | ||||||
|  |             "example": "Sta chì a l'é unna fraxe.", | ||||||
|  |             "has_examples": true | ||||||
|  |         }, | ||||||
|         { |         { | ||||||
|             "code": "lt", |             "code": "lt", | ||||||
|             "name": "Lithuanian", |             "name": "Lithuanian", | ||||||
|             "has_examples": true, |             "has_examples": true, | ||||||
|             "models": ["lt_core_news_sm", "lt_core_news_md", "lt_core_news_lg"] |             "models": ["lt_core_news_sm", "lt_core_news_md", "lt_core_news_lg"] | ||||||
|         }, |         }, | ||||||
|  |         { "code": "lv", "name": "Latvian" }, | ||||||
|  |         { "code": "ml", "name": "Malayalam", "has_examples": true }, | ||||||
|  |         { "code": "mr", "name": "Marathi" }, | ||||||
|         { |         { | ||||||
|             "code": "nb", |             "code": "nb", | ||||||
|             "name": "Norwegian Bokmål", |             "name": "Norwegian Bokmål", | ||||||
|  | @ -101,6 +127,14 @@ | ||||||
|             "has_examples": true, |             "has_examples": true, | ||||||
|             "models": ["nb_core_news_sm", "nb_core_news_md", "nb_core_news_lg"] |             "models": ["nb_core_news_sm", "nb_core_news_md", "nb_core_news_lg"] | ||||||
|         }, |         }, | ||||||
|  |         { "code": "ne", "name": "Nepali", "has_examples": true }, | ||||||
|  |         { | ||||||
|  |             "code": "nl", | ||||||
|  |             "name": "Dutch", | ||||||
|  |             "models": ["nl_core_news_sm", "nl_core_news_md", "nl_core_news_lg"], | ||||||
|  |             "example": "Dit is een zin.", | ||||||
|  |             "has_examples": true | ||||||
|  |         }, | ||||||
|         { |         { | ||||||
|             "code": "pl", |             "code": "pl", | ||||||
|             "name": "Polish", |             "name": "Polish", | ||||||
|  | @ -122,69 +156,26 @@ | ||||||
|             "has_examples": true, |             "has_examples": true, | ||||||
|             "models": ["ro_core_news_sm", "ro_core_news_md", "ro_core_news_lg"] |             "models": ["ro_core_news_sm", "ro_core_news_md", "ro_core_news_lg"] | ||||||
|         }, |         }, | ||||||
|         { |  | ||||||
|             "code": "es", |  | ||||||
|             "name": "Spanish", |  | ||||||
|             "models": ["es_core_news_sm", "es_core_news_md", "es_core_news_lg"], |  | ||||||
|             "example": "Esto es una frase.", |  | ||||||
|             "has_examples": true |  | ||||||
|         }, |  | ||||||
|         { "code": "sv", "name": "Swedish", "has_examples": true }, |  | ||||||
|         { "code": "fi", "name": "Finnish", "has_examples": true }, |  | ||||||
|         { "code": "hu", "name": "Hungarian", "example": "Ez egy mondat.", "has_examples": true }, |  | ||||||
|         { |         { | ||||||
|             "code": "ru", |             "code": "ru", | ||||||
|             "name": "Russian", |             "name": "Russian", | ||||||
|             "has_examples": true, |             "has_examples": true, | ||||||
|             "dependencies": [{ "name": "pymorphy2", "url": "https://github.com/kmike/pymorphy2" }] |             "dependencies": [{ "name": "pymorphy2", "url": "https://github.com/kmike/pymorphy2" }] | ||||||
|         }, |         }, | ||||||
|         { |         { "code": "sa", "name": "Sanskrit", "has_examples": true }, | ||||||
|             "code": "uk", |  | ||||||
|             "name": "Ukrainian", |  | ||||||
|             "has_examples": true, |  | ||||||
|             "dependencies": [{ "name": "pymorphy2", "url": "https://github.com/kmike/pymorphy2" }] |  | ||||||
|         }, |  | ||||||
|         { "code": "hr", "name": "Croatian", "has_examples": true }, |  | ||||||
|         { "code": "eu", "name": "Basque", "has_examples": true }, |  | ||||||
|         { "code": "yo", "name": "Yoruba", "has_examples": true }, |  | ||||||
|         { "code": "tr", "name": "Turkish", "example": "Bu bir cümledir.", "has_examples": true }, |  | ||||||
|         { "code": "ca", "name": "Catalan", "example": "Això és una frase.", "has_examples": true }, |  | ||||||
|         { "code": "he", "name": "Hebrew", "example": "זהו משפט.", "has_examples": true }, |  | ||||||
|         { "code": "ar", "name": "Arabic", "example": "هذه جملة", "has_examples": true }, |  | ||||||
|         { "code": "fa", "name": "Persian", "has_examples": true }, |  | ||||||
|         { "code": "ur", "name": "Urdu", "example": "یہ ایک جملہ ہے", "has_examples": true }, |  | ||||||
|         { "code": "tt", "name": "Tatar", "has_examples": true }, |  | ||||||
|         { "code": "te", "name": "Telugu", "example": "ఇది ఒక వాక్యం.", "has_examples": true }, |  | ||||||
|         { "code": "si", "name": "Sinhala", "example": "මෙය වාක්යයකි.", "has_examples": true }, |         { "code": "si", "name": "Sinhala", "example": "මෙය වාක්යයකි.", "has_examples": true }, | ||||||
|         { "code": "ga", "name": "Irish" }, |         { "code": "sk", "name": "Slovak", "has_examples": true }, | ||||||
|         { "code": "bn", "name": "Bengali", "has_examples": true }, |  | ||||||
|         { "code": "hi", "name": "Hindi", "example": "यह एक वाक्य है।", "has_examples": true }, |  | ||||||
|         { "code": "mr", "name": "Marathi" }, |  | ||||||
|         { "code": "kn", "name": "Kannada" }, |  | ||||||
|         { "code": "ta", "name": "Tamil", "has_examples": true }, |  | ||||||
|         { |  | ||||||
|             "code": "id", |  | ||||||
|             "name": "Indonesian", |  | ||||||
|             "example": "Ini adalah sebuah kalimat.", |  | ||||||
|             "has_examples": true |  | ||||||
|         }, |  | ||||||
|         { "code": "tl", "name": "Tagalog" }, |  | ||||||
|         { "code": "af", "name": "Afrikaans" }, |  | ||||||
|         { "code": "bg", "name": "Bulgarian", "example": "Това е изречение", "has_examples": true }, |  | ||||||
|         { "code": "cs", "name": "Czech" }, |  | ||||||
|         { "code": "is", "name": "Icelandic" }, |  | ||||||
|         { "code": "lv", "name": "Latvian" }, |  | ||||||
|         { "code": "sr", "name": "Serbian" }, |  | ||||||
|         { "code": "sk", "name": "Slovak" }, |  | ||||||
|         { "code": "sl", "name": "Slovenian" }, |         { "code": "sl", "name": "Slovenian" }, | ||||||
|         { "code": "lb", "name": "Luxembourgish" }, |  | ||||||
|         { |         { | ||||||
|             "code": "sq", |             "code": "sq", | ||||||
|             "name": "Albanian", |             "name": "Albanian", | ||||||
|             "example": "Kjo është një fjali.", |             "example": "Kjo është një fjali.", | ||||||
|             "has_examples": true |             "has_examples": true | ||||||
|         }, |         }, | ||||||
|         { "code": "et", "name": "Estonian" }, |         { "code": "sr", "name": "Serbian", "has_examples": true }, | ||||||
|  |         { "code": "sv", "name": "Swedish", "has_examples": true }, | ||||||
|  |         { "code": "ta", "name": "Tamil", "has_examples": true }, | ||||||
|  |         { "code": "te", "name": "Telugu", "example": "ఇది ఒక వాక్యం.", "has_examples": true }, | ||||||
|         { |         { | ||||||
|             "code": "th", |             "code": "th", | ||||||
|             "name": "Thai", |             "name": "Thai", | ||||||
|  | @ -194,51 +185,43 @@ | ||||||
|             "example": "นี่คือประโยค", |             "example": "นี่คือประโยค", | ||||||
|             "has_examples": true |             "has_examples": true | ||||||
|         }, |         }, | ||||||
|  |         { "code": "tl", "name": "Tagalog" }, | ||||||
|  |         { "code": "tr", "name": "Turkish", "example": "Bu bir cümledir.", "has_examples": true }, | ||||||
|  |         { "code": "tt", "name": "Tatar", "has_examples": true }, | ||||||
|         { |         { | ||||||
|             "code": "ko", |             "code": "uk", | ||||||
|             "name": "Korean", |             "name": "Ukrainian", | ||||||
|             "dependencies": [ |             "has_examples": true, | ||||||
|                 { |             "dependencies": [{ "name": "pymorphy2", "url": "https://github.com/kmike/pymorphy2" }] | ||||||
|                     "name": "mecab-ko", |  | ||||||
|                     "url": "https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md" |  | ||||||
|                 }, |  | ||||||
|                 { "name": "mecab-ko-dic", "url": "https://bitbucket.org/eunjeon/mecab-ko-dic" }, |  | ||||||
|                 { "name": "natto-py", "url": "https://github.com/buruzaemon/natto-py" } |  | ||||||
|             ], |  | ||||||
|             "example": "이것은 문장입니다.", |  | ||||||
|             "has_examples": true |  | ||||||
|         }, |         }, | ||||||
|  |         { "code": "ur", "name": "Urdu", "example": "یہ ایک جملہ ہے", "has_examples": true }, | ||||||
|         { |         { | ||||||
|             "code": "vi", |             "code": "vi", | ||||||
|             "name": "Vietnamese", |             "name": "Vietnamese", | ||||||
|             "dependencies": [{ "name": "Pyvi", "url": "https://github.com/trungtv/pyvi" }] |             "dependencies": [{ "name": "Pyvi", "url": "https://github.com/trungtv/pyvi" }] | ||||||
|         }, |         }, | ||||||
|         { |  | ||||||
|             "code": "lij", |  | ||||||
|             "name": "Ligurian", |  | ||||||
|             "example": "Sta chì a l'é unna fraxe.", |  | ||||||
|             "has_examples": true |  | ||||||
|         }, |  | ||||||
|         { |  | ||||||
|             "code": "hy", |  | ||||||
|             "name": "Armenian", |  | ||||||
|             "has_examples": true |  | ||||||
|         }, |  | ||||||
|         { |  | ||||||
|             "code": "gu", |  | ||||||
|             "name": "Gujarati", |  | ||||||
|             "has_examples": true |  | ||||||
|         }, |  | ||||||
|         { |  | ||||||
|             "code": "ml", |  | ||||||
|             "name": "Malayalam", |  | ||||||
|             "has_examples": true |  | ||||||
|         }, |  | ||||||
|         { |         { | ||||||
|             "code": "xx", |             "code": "xx", | ||||||
|             "name": "Multi-language", |             "name": "Multi-language", | ||||||
|             "models": ["xx_ent_wiki_sm"], |             "models": ["xx_ent_wiki_sm"], | ||||||
|             "example": "This is a sentence about Facebook." |             "example": "This is a sentence about Facebook." | ||||||
|  |         }, | ||||||
|  |         { "code": "yo", "name": "Yoruba", "has_examples": true }, | ||||||
|  |         { | ||||||
|  |             "code": "zh", | ||||||
|  |             "name": "Chinese", | ||||||
|  |             "models": ["zh_core_web_sm", "zh_core_web_md", "zh_core_web_lg"], | ||||||
|  |             "dependencies": [ | ||||||
|  |                 { | ||||||
|  |                     "name": "Jieba", | ||||||
|  |                     "url": "https://github.com/fxsjy/jieba" | ||||||
|  |                 }, | ||||||
|  |                 { | ||||||
|  |                     "name": "PKUSeg", | ||||||
|  |                     "url": "https://github.com/lancopku/PKUSeg-python" | ||||||
|  |                 } | ||||||
|  |             ], | ||||||
|  |             "has_examples": true | ||||||
|         } |         } | ||||||
|     ], |     ], | ||||||
|     "licenses": [ |     "licenses": [ | ||||||
|  |  | ||||||
|  | @ -1,4 +1,4 @@ | ||||||
| import React from 'react' | import React, { Fragment } from 'react' | ||||||
| import PropTypes from 'prop-types' | import PropTypes from 'prop-types' | ||||||
| import classNames from 'classnames' | import classNames from 'classnames' | ||||||
| 
 | 
 | ||||||
|  | @ -14,13 +14,16 @@ export default function Infobox({ | ||||||
|     className, |     className, | ||||||
|     children, |     children, | ||||||
| }) { | }) { | ||||||
|  |     const Wrapper = id ? 'div' : Fragment | ||||||
|     const infoboxClassNames = classNames(classes.root, className, { |     const infoboxClassNames = classNames(classes.root, className, { | ||||||
|         [classes.list]: !!list, |         [classes.list]: !!list, | ||||||
|         [classes.warning]: variant === 'warning', |         [classes.warning]: variant === 'warning', | ||||||
|         [classes.danger]: variant === 'danger', |         [classes.danger]: variant === 'danger', | ||||||
|     }) |     }) | ||||||
|     return ( |     return ( | ||||||
|         <aside className={infoboxClassNames} id={id}> |         <Wrapper> | ||||||
|  |             {id && <a id={id} />} | ||||||
|  |             <aside className={infoboxClassNames}> | ||||||
|                 {title && ( |                 {title && ( | ||||||
|                     <h4 className={classes.title}> |                     <h4 className={classes.title}> | ||||||
|                         {variant !== 'default' && !emoji && ( |                         {variant !== 'default' && !emoji && ( | ||||||
|  | @ -38,6 +41,7 @@ export default function Infobox({ | ||||||
|                 )} |                 )} | ||||||
|                 {children} |                 {children} | ||||||
|             </aside> |             </aside> | ||||||
|  |         </Wrapper> | ||||||
|     ) |     ) | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -12,7 +12,6 @@ import Tag from '../components/tag' | ||||||
| import { H2, Label } from '../components/typography' | import { H2, Label } from '../components/typography' | ||||||
| import Icon from '../components/icon' | import Icon from '../components/icon' | ||||||
| import Link from '../components/link' | import Link from '../components/link' | ||||||
| import Grid from '../components/grid' |  | ||||||
| import Infobox from '../components/infobox' | import Infobox from '../components/infobox' | ||||||
| import Accordion from '../components/accordion' | import Accordion from '../components/accordion' | ||||||
| import { join, arrayToObj, abbrNum, markdownToReact } from '../components/util' | import { join, arrayToObj, abbrNum, markdownToReact } from '../components/util' | ||||||
|  | @ -31,10 +30,16 @@ const MODEL_META = { | ||||||
|     wiki: 'Wikipedia', |     wiki: 'Wikipedia', | ||||||
|     uas: 'Unlabelled dependencies', |     uas: 'Unlabelled dependencies', | ||||||
|     las: 'Labelled dependencies', |     las: 'Labelled dependencies', | ||||||
|  |     token_acc: 'Tokenization', | ||||||
|  |     tok: 'Tokenization', | ||||||
|     tags_acc: 'Part-of-speech tags (fine grained tags, Token.tag)', |     tags_acc: 'Part-of-speech tags (fine grained tags, Token.tag)', | ||||||
|     ents_f: 'Entities (F-score)', |     tag: 'Part-of-speech tags (fine grained tags, Token.tag)', | ||||||
|     ents_p: 'Entities (precision)', |     ents_f: 'Named entities (F-score)', | ||||||
|     ents_r: 'Entities (recall)', |     ents_p: 'Named entities (precision)', | ||||||
|  |     ents_r: 'Named entities (recall)', | ||||||
|  |     sent_f: 'Sentence segmentation (F-score)', | ||||||
|  |     sent_p: 'Sentence segmentation (precision)', | ||||||
|  |     sent_r: 'Sentence segmentation (recall)', | ||||||
|     cpu: 'words per second on CPU', |     cpu: 'words per second on CPU', | ||||||
|     gpu: 'words per second on GPU', |     gpu: 'words per second on GPU', | ||||||
|     pipeline: 'Active processing pipeline components in order', |     pipeline: 'Active processing pipeline components in order', | ||||||
|  | @ -83,25 +88,19 @@ function formatVectors(data) { | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| function formatAccuracy(data) { | function formatAccuracy(data) { | ||||||
|     if (!data) return null |     if (!data) return [] | ||||||
|     const labels = { |  | ||||||
|         las: 'LAS', |  | ||||||
|         uas: 'UAS', |  | ||||||
|         tags_acc: 'TAG', |  | ||||||
|         ents_f: 'NER F', |  | ||||||
|         ents_p: 'NER P', |  | ||||||
|         ents_r: 'NER R', |  | ||||||
|     } |  | ||||||
|     const isSyntax = key => ['tags_acc', 'las', 'uas'].includes(key) |  | ||||||
|     const isNer = key => key.startsWith('ents_') |  | ||||||
|     return Object.keys(data) |     return Object.keys(data) | ||||||
|         .filter(key => labels[key]) |         .map(label => { | ||||||
|         .map(key => ({ |             const value = data[label] | ||||||
|             label: labels[key], |             return isNaN(value) | ||||||
|             value: data[key].toFixed(2), |                 ? null | ||||||
|             help: MODEL_META[key], |                 : { | ||||||
|             type: isNer(key) ? 'ner' : isSyntax(key) ? 'syntax' : null, |                       label, | ||||||
|         })) |                       value: value.toFixed(2), | ||||||
|  |                       help: MODEL_META[label], | ||||||
|  |                   } | ||||||
|  |         }) | ||||||
|  |         .filter(item => item) | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| function formatModelMeta(data) { | function formatModelMeta(data) { | ||||||
|  | @ -188,16 +187,6 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl | ||||||
|         { label: 'Author', content: author }, |         { label: 'Author', content: author }, | ||||||
|         { label: 'License', content: license }, |         { label: 'License', content: license }, | ||||||
|     ] |     ] | ||||||
|     const accuracy = [ |  | ||||||
|         { |  | ||||||
|             label: 'Syntax Accuracy', |  | ||||||
|             items: meta.accuracy ? meta.accuracy.filter(a => a.type === 'syntax') : null, |  | ||||||
|         }, |  | ||||||
|         { |  | ||||||
|             label: 'NER Accuracy', |  | ||||||
|             items: meta.accuracy ? meta.accuracy.filter(a => a.type === 'ner') : null, |  | ||||||
|         }, |  | ||||||
|     ] |  | ||||||
| 
 | 
 | ||||||
|     const error = ( |     const error = ( | ||||||
|         <Infobox title="Unable to load model details from GitHub" variant="danger"> |         <Infobox title="Unable to load model details from GitHub" variant="danger"> | ||||||
|  | @ -209,7 +198,6 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl | ||||||
|             </p> |             </p> | ||||||
|         </Infobox> |         </Infobox> | ||||||
|     ) |     ) | ||||||
| 
 |  | ||||||
|     return ( |     return ( | ||||||
|         <Section id={name}> |         <Section id={name}> | ||||||
|             <H2 |             <H2 | ||||||
|  | @ -254,33 +242,6 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl | ||||||
|                     )} |                     )} | ||||||
|                 </tbody> |                 </tbody> | ||||||
|             </Table> |             </Table> | ||||||
|             <Grid cols={2} gutterBottom={hasInteractiveCode || !!labels}> |  | ||||||
|                 {accuracy && |  | ||||||
|                     accuracy.map(({ label, items }, i) => |  | ||||||
|                         !items ? null : ( |  | ||||||
|                             <Table fixed key={i}> |  | ||||||
|                                 <thead> |  | ||||||
|                                     <Tr> |  | ||||||
|                                         <Th colSpan={2}>{label}</Th> |  | ||||||
|                                     </Tr> |  | ||||||
|                                 </thead> |  | ||||||
|                                 <tbody> |  | ||||||
|                                     {items.map((item, i) => ( |  | ||||||
|                                         <Tr key={i}> |  | ||||||
|                                             <Td> |  | ||||||
|                                                 <Label> |  | ||||||
|                                                     {item.label}{' '} |  | ||||||
|                                                     {item.help && <Help>{item.help}</Help>} |  | ||||||
|                                                 </Label> |  | ||||||
|                                             </Td> |  | ||||||
|                                             <Td num>{item.value}</Td> |  | ||||||
|                                         </Tr> |  | ||||||
|                                     ))} |  | ||||||
|                                 </tbody> |  | ||||||
|                             </Table> |  | ||||||
|                         ) |  | ||||||
|                     )} |  | ||||||
|             </Grid> |  | ||||||
|             {meta.notes && markdownToReact(meta.notes, MARKDOWN_COMPONENTS)} |             {meta.notes && markdownToReact(meta.notes, MARKDOWN_COMPONENTS)} | ||||||
|             {hasInteractiveCode && ( |             {hasInteractiveCode && ( | ||||||
|                 <CodeBlock title="Try out the model" lang="python" executable={true}> |                 <CodeBlock title="Try out the model" lang="python" executable={true}> | ||||||
|  | @ -288,7 +249,7 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl | ||||||
|                         `import spacy`, |                         `import spacy`, | ||||||
|                         `from spacy.lang.${langId}.examples import sentences `, |                         `from spacy.lang.${langId}.examples import sentences `, | ||||||
|                         ``, |                         ``, | ||||||
|                         `nlp = spacy.load('${name}')`, |                         `nlp = spacy.load("${name}")`, | ||||||
|                         `doc = nlp(sentences[0])`, |                         `doc = nlp(sentences[0])`, | ||||||
|                         `print(doc.text)`, |                         `print(doc.text)`, | ||||||
|                         `for token in doc:`, |                         `for token in doc:`, | ||||||
|  | @ -296,6 +257,25 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl | ||||||
|                     ].join('\n')} |                     ].join('\n')} | ||||||
|                 </CodeBlock> |                 </CodeBlock> | ||||||
|             )} |             )} | ||||||
|  |             {meta.accuracy && ( | ||||||
|  |                 <Accordion id={`${name}-accuracy`} title="Accuracy Evaluation"> | ||||||
|  |                     <Table> | ||||||
|  |                         <tbody> | ||||||
|  |                             {meta.accuracy.map(({ label, value, help }) => ( | ||||||
|  |                                 <Tr key={`${name}-${label}`}> | ||||||
|  |                                     <Td nowrap> | ||||||
|  |                                         <InlineCode>{label.toUpperCase()}</InlineCode> | ||||||
|  |                                     </Td> | ||||||
|  |                                     <Td>{help}</Td> | ||||||
|  |                                     <Td num style={{ textAlign: 'right' }}> | ||||||
|  |                                         {value} | ||||||
|  |                                     </Td> | ||||||
|  |                                 </Tr> | ||||||
|  |                             ))} | ||||||
|  |                         </tbody> | ||||||
|  |                     </Table> | ||||||
|  |                 </Accordion> | ||||||
|  |             )} | ||||||
|             {labels && ( |             {labels && ( | ||||||
|                 <Accordion id={`${name}-labels`} title="Label Scheme"> |                 <Accordion id={`${name}-labels`} title="Label Scheme"> | ||||||
|                     <p> |                     <p> | ||||||
|  | @ -313,7 +293,7 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl | ||||||
|                                 const labelNames = labels[pipe] || [] |                                 const labelNames = labels[pipe] || [] | ||||||
|                                 const help = LABEL_SCHEME_META[pipe] |                                 const help = LABEL_SCHEME_META[pipe] | ||||||
|                                 return ( |                                 return ( | ||||||
|                                     <Tr key={pipe} evenodd={false} key={pipe}> |                                     <Tr key={`${name}-${pipe}`} evenodd={false}> | ||||||
|                                         <Td style={{ width: '20%' }}> |                                         <Td style={{ width: '20%' }}> | ||||||
|                                             <Label> |                                             <Label> | ||||||
|                                                 {pipe} {help && <Help>{help}</Help>} |                                                 {pipe} {help && <Help>{help}</Help>} | ||||||
|  | @ -343,7 +323,7 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl | ||||||
| const Models = ({ pageContext, repo, children }) => { | const Models = ({ pageContext, repo, children }) => { | ||||||
|     const [initialized, setInitialized] = useState(false) |     const [initialized, setInitialized] = useState(false) | ||||||
|     const [compatibility, setCompatibility] = useState({}) |     const [compatibility, setCompatibility] = useState({}) | ||||||
|     const { id, title, meta } = pageContext |     const { id, title, meta, hasExamples } = pageContext | ||||||
|     const { models, isStarters } = meta |     const { models, isStarters } = meta | ||||||
|     const baseUrl = `https://raw.githubusercontent.com/${repo}/master` |     const baseUrl = `https://raw.githubusercontent.com/${repo}/master` | ||||||
| 
 | 
 | ||||||
|  | @ -360,7 +340,6 @@ const Models = ({ pageContext, repo, children }) => { | ||||||
| 
 | 
 | ||||||
|     const modelTitle = title |     const modelTitle = title | ||||||
|     const modelTeaser = `Available trained pipelines for ${title}` |     const modelTeaser = `Available trained pipelines for ${title}` | ||||||
| 
 |  | ||||||
|     const starterTitle = `${title} starters` |     const starterTitle = `${title} starters` | ||||||
|     const starterTeaser = `Available transfer learning starter packs for ${title}` |     const starterTeaser = `Available transfer learning starter packs for ${title}` | ||||||
| 
 | 
 | ||||||
|  | @ -392,6 +371,7 @@ const Models = ({ pageContext, repo, children }) => { | ||||||
|                             baseUrl={baseUrl} |                             baseUrl={baseUrl} | ||||||
|                             repo={repo} |                             repo={repo} | ||||||
|                             licenses={arrayToObj(site.siteMetadata.licenses, 'id')} |                             licenses={arrayToObj(site.siteMetadata.licenses, 'id')} | ||||||
|  |                             hasExamples={meta.hasExamples} | ||||||
|                         /> |                         /> | ||||||
|                     )) |                     )) | ||||||
|                 } |                 } | ||||||
|  |  | ||||||
|  | @ -297,7 +297,7 @@ const Landing = ({ data }) => { | ||||||
|                         to run. |                         to run. | ||||||
|                     </p> |                     </p> | ||||||
|                     <p> |                     <p> | ||||||
|                         <Button to="/usage/facts-figures#benchmarks">See details</Button> |                         <Button to="/usage/facts-figures#benchmarks">More results</Button> | ||||||
|                     </p> |                     </p> | ||||||
|                 </LandingCol> |                 </LandingCol> | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -22,7 +22,7 @@ const Language = ({ name, code, models }) => ( | ||||||
|         <Td> |         <Td> | ||||||
|             {models && models.length ? ( |             {models && models.length ? ( | ||||||
|                 <Link to={`/models/${code}`}> |                 <Link to={`/models/${code}`}> | ||||||
|                     {models.length} {models.length === 1 ? 'model' : 'models'} |                     {models.length} {models.length === 1 ? 'package' : 'packages'} | ||||||
|                 </Link> |                 </Link> | ||||||
|             ) : ( |             ) : ( | ||||||
|                 <em>none yet</em> |                 <em>none yet</em> | ||||||
|  | @ -51,7 +51,7 @@ const Languages = () => ( | ||||||
|                                 <Th>Language</Th> |                                 <Th>Language</Th> | ||||||
|                                 <Th>Code</Th> |                                 <Th>Code</Th> | ||||||
|                                 <Th>Language Data</Th> |                                 <Th>Language Data</Th> | ||||||
|                                 <Th>Models</Th> |                                 <Th>Pipelines</Th> | ||||||
|                             </Tr> |                             </Tr> | ||||||
|                         </thead> |                         </thead> | ||||||
|                         <tbody> |                         <tbody> | ||||||
|  |  | ||||||
|  | @ -16,7 +16,8 @@ export default function Project({ | ||||||
| }) { | }) { | ||||||
|     const repoArg = repo ? ` --repo ${repo}` : '' |     const repoArg = repo ? ` --repo ${repo}` : '' | ||||||
|     const text = `${COMMAND} ${id}${repoArg}` |     const text = `${COMMAND} ${id}${repoArg}` | ||||||
|     const url = `${repo || projectsRepo}/${id}` |     const defaultRepo = `https://github.com/${projectsRepo}` | ||||||
|  |     const url = `${repo || defaultRepo}/${id}` | ||||||
|     const header = ( |     const header = ( | ||||||
|         <> |         <> | ||||||
|             {title}:{' '} |             {title}:{' '} | ||||||
|  |  | ||||||